Authored by bmansurov on Sep 18 2017, 3:32 PM
# Downloads the capital cities of the world from Wikipedia and prints a
# single HTML page containing all of their articles.
#
# Setup and usage:
#   pip install beautifulsoup4
#   python articles.py > capitals.html
import sys
import urllib.request

from bs4 import BeautifulSoup

# List article holding the table of countries and their capitals.
URL = 'https://en.wikipedia.org/wiki/List_of_countries_by_national_capital,'\
    '_largest_and_second-largest_cities'
# Wikimedia REST API endpoint that returns the HTML of a page.
RESTBASE_ENDPOINT = 'https://en.wikipedia.org/api/rest_v1/page/html/'


def fetch_article(title):
    """Fetch the article with the given title and return it as a soup."""
    with urllib.request.urlopen(
            RESTBASE_ENDPOINT + title) as response:
        html = BeautifulSoup(response.read(), 'html.parser')
    return html


def main():
    html = None
    with urllib.request.urlopen(URL) as response:
        html = BeautifulSoup(response.read(), 'html.parser')
    if not html:
        print('Could not open the URL.')
        sys.exit(1)

    # Collect a (title, soup) pair for the capital linked in the second
    # column of each table row.
    article_soups = []
    rows = html.find('table', class_='wikitable').find_all('tr')
    for row in rows:
        cells = row.find_all('td')
        if len(cells) > 2:
            cell = cells[1]
            link = cell.find('a', href=True)
            if link:
                title = link['href'].split('/')[-1]
                article_soups.append(
                    (title, fetch_article(title))
                )

    # Use the first article as the base document and make its <base> URL
    # absolute so that relative links keep working.
    first_title, first_soup = article_soups[0]
    base = first_soup.find('base')
    if base and base.get('href', '').startswith('//'):
        base['href'] = 'https:' + base['href']
    first_soup.body.insert(
        0, BeautifulSoup('<h1>' + first_title + '</h1>', 'html.parser'))

    # Append the remaining articles, each preceded by its title. Copy the
    # children into a list first: appending an element to another tree
    # removes it from its parent, which would otherwise skip elements
    # while iterating.
    for title, soup in article_soups[1:]:
        first_soup.body.append(
            BeautifulSoup('<h1>' + title + '</h1>', 'html.parser'))
        for element in list(soup.body.children):
            first_soup.body.append(element)
    print(first_soup.prettify())


if __name__ == '__main__':
    main()
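
Note (not part of the original paste): fetch_article lets any failed RESTBase request abort the whole run, because urllib.request.urlopen raises urllib.error.URLError (or its subclass HTTPError) on failure. A minimal sketch of a more forgiving variant, assuming it is acceptable to skip articles that fail to download; fetch_article_or_none is a hypothetical name, not from the original script:

import urllib.error
import urllib.request

from bs4 import BeautifulSoup

RESTBASE_ENDPOINT = 'https://en.wikipedia.org/api/rest_v1/page/html/'


def fetch_article_or_none(title):
    # Return the article soup, or None if the RESTBase request fails
    # (for example, a 404 for a title that has no article).
    try:
        with urllib.request.urlopen(
                RESTBASE_ENDPOINT + title) as response:
            return BeautifulSoup(response.read(), 'html.parser')
    except urllib.error.URLError:
        return None

The caller would then filter out the None entries before concatenating the articles.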
