Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Paste
P6015
T175853
Active
Public
Actions
Authored by
•
bmansurov
on Sep 18 2017, 3:32 PM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Referenced Files
F9625657: T175853
Sep 18 2017, 3:32 PM
2017-09-18 15:32:39 (UTC+0)
Subscribers
None
# Downloads the Wikipedia articles of the world's capital cities and combines them into one HTML document.
# pip install bs4
# python articles.py > capitals.html
import
urllib.request
from
bs4
import
BeautifulSoup
# Wikipedia list page that main() scrapes for links to capital-city articles.
URL = (
    'https://en.wikipedia.org/wiki/List_of_countries_by_national_capital,'
    '_largest_and_second-largest_cities'
)

# REST endpoint prefix; fetch_article() appends an article title to it.
RESTBASE_ENDPOINT = 'https://en.wikipedia.org/api/rest_v1/page/html/'
def fetch_article(title):
    """Fetch the HTML of one article and return it as a parsed soup.

    Args:
        title: Article title exactly as it should appear in the request
            path; it is appended to RESTBASE_ENDPOINT unchanged.

    Returns:
        A BeautifulSoup document built with the 'html.parser' backend.
    """
    article_url = RESTBASE_ENDPOINT + title
    with urllib.request.urlopen(article_url) as response:
        markup = response.read()
    return BeautifulSoup(markup, 'html.parser')
def main():
    """Merge the articles of the listed capital cities into one HTML page.

    Downloads the list page at URL, takes the first link in the second
    cell of every row (with more than two cells) of the first 'wikitable',
    fetches each linked article through fetch_article(), and appends the
    body of every later article to the first one, each preceded by an
    <h1> holding its title. The merged document is printed to stdout.

    Error messages go to stderr so they do not end up in a redirected
    HTML file; the process then exits with status 1.
    """
    # Local imports keep this block self-contained.
    import sys
    from urllib.error import URLError

    # urlopen() raises on failure instead of returning a falsy value, so
    # the failure has to be caught rather than tested for afterwards.
    try:
        with urllib.request.urlopen(URL) as response:
            html = BeautifulSoup(response.read(), 'html.parser')
    except URLError:
        print('Could not open the URL.', file=sys.stderr)
        sys.exit(1)

    table = html.find('table', class_='wikitable')
    if table is None:
        print('Could not find the table of capitals.', file=sys.stderr)
        sys.exit(1)

    article_soups = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 2:
            # Header rows and short rows carry no capital cell.
            continue
        link = cells[1].find('a', href=True)
        if not link:
            continue
        # The article title is the last path segment of the link target.
        title = link['href'].split('/')[-1]
        article_soups.append((title, fetch_article(title)))

    if not article_soups:
        print('Could not find any capital articles.', file=sys.stderr)
        sys.exit(1)

    first_title, first_soup = article_soups[0]

    # Make a protocol-relative <base href="//..."> absolute.
    base = first_soup.find('base')
    if base and base.get('href', '').startswith('//'):
        base['href'] = 'https:' + base['href']

    # Build headings as real tags so the title is inserted as text, not
    # parsed as markup.
    heading = first_soup.new_tag('h1')
    heading.string = first_title
    first_soup.body.insert(0, heading)

    for title, soup in article_soups[1:]:
        heading = first_soup.new_tag('h1')
        heading.string = title
        first_soup.body.append(heading)
        if soup.body is None:
            continue
        # Appending a node detaches it from its old parent, which mutates
        # soup.body's child list; iterate over a snapshot so no node is
        # skipped.
        for element in list(soup.body.contents):
            first_soup.body.append(element)

    print(first_soup.prettify())
# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Event Timeline
•
bmansurov
created this paste.
Sep 18 2017, 3:32 PM
2017-09-18 15:32:39 (UTC+0)
•
bmansurov
mentioned this in
T175853: [Spike 16hr] Investigate the ability of Python wrapped headless Chromium to render large books
.
Sep 18 2017, 9:12 PM
2017-09-18 21:12:44 (UTC+0)
Log In to Comment