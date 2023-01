#!/usr/bin/env python

import

csv

import

hashlib

import

json

import

os

from

typing

import

Iterable

from

bs4

import

BeautifulSoup

from

requests

import

Session

# Root of the site; magazine, issue and article URLs are all built by
# appending a path to this value.
BASE_URL = "https://connect.ed-diamond.com"

# Short magazine code (also written to the "magazine" CSV column) mapped to
# the path segment of that magazine's page under BASE_URL.
URL_MAPPING = {
    "glmf": "gnu-linux-magazine",
    "misc": "misc",
    "hackable": "hackable",
    "lp": "linux-pratique",
}

# When true, get_soup() reads pages from CACHE_DIR and stores downloads there.
USE_CACHE = True

# Directory holding the cached pages, one "<SHA-1 of the URL>.html" file each.
CACHE_DIR = "cache"

def get_soup(url: str, session: Session) -> BeautifulSoup:
    """Return the parsed HTML of ``url``, using the on-disk cache when enabled.

    When ``USE_CACHE`` is true, the page is looked up in ``CACHE_DIR`` under
    the SHA-1 hex digest of the URL. On a miss it is downloaded through
    ``session`` and stored there for later runs.

    Args:
        url: URL of the page to fetch.
        session: HTTP session used for the download on a cache miss.

    Returns:
        The page parsed with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            failed response is not written to the cache.
    """
    # Progress output: one line per requested page, cached or not.
    print(url)

    html = None
    cache_entry = None
    if USE_CACHE:
        # SHA-1 only derives a stable file name here; it is not used for
        # anything security-sensitive.
        url_hash = hashlib.sha1(url.encode()).hexdigest()
        cache_entry = os.path.join(CACHE_DIR, url_hash + ".html")
        try:
            # Explicit encoding so the cache does not depend on the locale.
            with open(cache_entry, encoding="utf-8") as f:
                html = f.read()
        except (FileNotFoundError, UnicodeDecodeError):
            # No entry yet, or an entry written earlier with a different
            # locale encoding: treat both as a miss and download again.
            pass

    # Cache miss (or cache disabled): download the page.
    if html is None:
        response = session.get(url)
        # Fail loudly instead of parsing, and permanently caching, an
        # HTTP error page.
        response.raise_for_status()
        html = response.text
        if cache_entry is not None:
            # Register the page in the cache.
            os.makedirs(CACHE_DIR, exist_ok=True)
            with open(cache_entry, "w", encoding="utf-8") as f:
                f.write(html)

    return BeautifulSoup(html, features="html.parser")

def get_issues(magazine: str, session: Session) -> Iterable:
    """Yield the ``href`` of every issue link found on a magazine's page.

    Args:
        magazine: Key of ``URL_MAPPING`` identifying the magazine.
        session: HTTP session handed to ``get_soup`` for the download.

    Yields:
        The raw ``href`` attribute of each matching link, in document order.

    Raises:
        KeyError: If ``magazine`` is not a key of ``URL_MAPPING``, or if a
            matching link has no ``href`` attribute.
    """
    landing_url = f"{BASE_URL}/{URL_MAPPING[magazine]}"
    landing_page = get_soup(landing_url, session)

    # Issue links are taken from the second sidebar region of the page.
    # Alternative selector kept for reference:
    #   ".view-display-id-block_3 > div:nth-child(1) > div a"
    issue_links = landing_page.select(
        ".region-sidebar-second .context-depending-bgcolor-link a"
    )
    yield from (link.attrs["href"] for link in issue_links)

def get_articles(issue_url: str, session: Session) -> Iterable:
    """Yield ``(title, author, paywall, url)`` for each article of an issue.

    Some issue pages carry the article list directly in the HTML, others
    embed it as JSON inside a ``script`` tag of type
    ``application/vnd.drupal-ajax``. When such a tag is found for the
    article-list block, its embedded HTML is parsed and used instead of the
    page itself.

    Args:
        issue_url: Path of the issue page; it is appended to ``BASE_URL``.
        session: HTTP session handed to ``get_soup`` for the download.

    Yields:
        Tuples of the article title text, the text of the first author link,
        whether the title span carries the ``is_locked_icon`` class, and the
        raw ``href`` of the article link.

    Raises:
        IndexError: If an article entry lacks the title link, the title span
            or an author link.
        KeyError: If the title link has no ``href`` or the title span has no
            ``class`` attribute.
    """
    page = get_soup(BASE_URL + issue_url, session)

    # Look for the first script tag that replaces the article-list block.
    placeholder_attr = "data-big-pipe-replacement-for-placeholder-with-id"
    wanted_block = "views_block__view_articles_list_block_7"
    embedded = next(
        (
            script
            for script in page.select('script[type="application/vnd.drupal-ajax"]')
            if wanted_block in script.attrs.get(placeholder_attr, "")
        ),
        None,
    )
    if embedded is not None:
        # The tag holds a JSON list whose first item carries the HTML to
        # parse under its "data" key.
        payload = json.loads(embedded.text)
        page = BeautifulSoup(payload[0]["data"], features="html.parser")

    # One ".view_inner" element per article.
    for entry in page.select(".view-display-id-block_7 .view_inner"):
        link = entry.select(".views-field-title a")[0]
        href = link.attrs["href"]
        label = link.select(".views-field-title a span")[0]
        heading = label.text
        locked = "is_locked_icon" in label.attrs["class"]
        first_author = entry.select(".views-field-field-ct-article-authors a")[0]
        yield heading, first_author.text, locked, href

def main() -> None:
    """Scrape every magazine in ``URL_MAPPING`` into ``eddiamond.csv``.

    One CSV row is written per article, with the columns ``magazine``,
    ``issue``, ``issue_url``, ``title``, ``author``, ``paywall`` and ``url``.
    The output file is created in the current working directory and is
    overwritten if it already exists.
    """
    fieldnames = (
        "magazine",
        "issue",
        "issue_url",
        "title",
        "author",
        "paywall",
        "url",
    )

    # One session is shared by every request and closed on exit.
    with Session() as session:
        # newline="" is required by the csv module so that it controls line
        # endings itself; the explicit encoding keeps the output independent
        # of the locale.
        with open("eddiamond.csv", "w", newline="", encoding="utf-8") as f:
            csv_writer = csv.DictWriter(f, fieldnames)
            csv_writer.writeheader()

            # Walk every article of every issue of every magazine.
            for magazine in URL_MAPPING:
                print(magazine)
                for issue_url in get_issues(magazine, session):
                    # The last path segment of the issue URL names the issue;
                    # it is the same for all articles of that issue.
                    issue_name = issue_url.rsplit("/", 1)[-1]
                    for (
                        article_title,
                        article_author,
                        article_paywall,
                        article_url,
                    ) in get_articles(issue_url, session):
                        csv_writer.writerow(
                            {
                                "magazine": magazine,
                                "issue": issue_name,
                                "issue_url": BASE_URL + issue_url,
                                "title": article_title,
                                "author": article_author,
                                "paywall": article_paywall,
                                "url": BASE_URL + article_url,
                            }
                        )

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()