Authored by bmansurov on Sep 18 2017, 3:32 PM
# Downloads the capital cities of the world from Wikipedia and prints a
# single HTML page containing all of their articles.
#
# Setup and usage:
#   pip install beautifulsoup4
#   python articles.py > capitals.html
import sys
import urllib.request

from bs4 import BeautifulSoup

# List article holding the table of countries and their capitals.
URL = 'https://en.wikipedia.org/wiki/List_of_countries_by_national_capital,'\
    '_largest_and_second-largest_cities'
# Wikimedia REST API endpoint that returns the HTML of a page.
RESTBASE_ENDPOINT = 'https://en.wikipedia.org/api/rest_v1/page/html/'


def fetch_article(title):
    """Fetch the article with the given title and return it as a soup."""
    with urllib.request.urlopen(
            RESTBASE_ENDPOINT + title) as response:
        html = BeautifulSoup(response.read(), 'html.parser')
    return html


def main():
    html = None
    with urllib.request.urlopen(URL) as response:
        html = BeautifulSoup(response.read(), 'html.parser')
    if not html:
        print('Could not open the URL.')
        sys.exit(1)

    # Collect a (title, soup) pair for the capital linked in the second
    # column of each table row.
    article_soups = []
    rows = html.find('table', class_='wikitable').find_all('tr')
    for row in rows:
        cells = row.find_all('td')
        if len(cells) > 2:
            cell = cells[1]
            link = cell.find('a', href=True)
            if link:
                title = link['href'].split('/')[-1]
                article_soups.append(
                    (title, fetch_article(title))
                )

    # Use the first article as the base document and make its <base> URL
    # absolute so that relative links keep working.
    first_title, first_soup = article_soups[0]
    base = first_soup.find('base')
    if base and base.get('href', '').startswith('//'):
        base['href'] = 'https:' + base['href']
    first_soup.body.insert(
        0, BeautifulSoup('<h1>' + first_title + '</h1>', 'html.parser'))

    # Append the remaining articles, each preceded by its title. Copy the
    # children into a list first: appending an element to another tree
    # removes it from its parent, which would otherwise skip elements
    # while iterating.
    for title, soup in article_soups[1:]:
        first_soup.body.append(
            BeautifulSoup('<h1>' + title + '</h1>', 'html.parser'))
        for element in list(soup.body.children):
            first_soup.body.append(element)
    print(first_soup.prettify())


if __name__ == '__main__':
    main()
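
Note (not part of the original paste): fetch_article lets any failed RESTBase request abort the whole run, because urllib.request.urlopen raises urllib.error.URLError (or its subclass HTTPError) on failure. A minimal sketch of a more forgiving variant, assuming it is acceptable to skip articles that fail to download; fetch_article_or_none is a hypothetical name, not from the original script:

import urllib.error
import urllib.request

from bs4 import BeautifulSoup

RESTBASE_ENDPOINT = 'https://en.wikipedia.org/api/rest_v1/page/html/'


def fetch_article_or_none(title):
    # Return the article soup, or None if the RESTBase request fails
    # (for example, a 404 for a title that has no article).
    try:
        with urllib.request.urlopen(
                RESTBASE_ENDPOINT + title) as response:
            return BeautifulSoup(response.read(), 'html.parser')
    except urllib.error.URLError:
        return None

The caller would then filter out the None entries before concatenating the articles.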
