"""Sample random English-Wikipedia articles and check that their PDFs parse.

For each sample the script asks the Wikimedia REST API for a random page
title, downloads that page's PDF rendering, and feeds the bytes to
pdfminer's ``extract_text``.  A PDF that pdfminer can read is counted as
valid, one that makes pdfminer raise is counted as invalid, and an HTTP or
network failure is counted separately so that service errors are not
mistaken for malformed PDFs.
"""
import io
from urllib.parse import quote

import requests
from pdfminer.high_level import extract_text

RANDOM_TITLE_URL = 'https://en.wikipedia.org/api/rest_v1/page/random/title'
PDF_URL_TEMPLATE = 'https://en.wikipedia.org/api/rest_v1/page/pdf/{}'
# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 30


def fetch_random_title():
    """Return the title of a random page as reported by the REST API.

    Raises:
        requests.RequestException: on network failure or a non-2xx status.
    """
    response = requests.get(RANDOM_TITLE_URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()['items'][0]['title']


def fetch_pdf(title):
    """Download the PDF rendering of the page *title* and return its bytes.

    Raises:
        requests.RequestException: on network failure or a non-2xx status.
    """
    # Escape every reserved character (including "/") so that the title is
    # always sent as a single path segment.
    url = PDF_URL_TEMPLATE.format(quote(title, safe=''))
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Without this check an HTTP error body would be passed on to pdfminer
    # and reported as an invalid PDF.
    response.raise_for_status()
    return response.content


def main(sample_size=20):
    """Check *sample_size* random pages and return the tallies.

    Returns:
        A ``(valid, invalid, failed)`` tuple: PDFs pdfminer parsed, PDFs
        pdfminer rejected, and samples whose HTTP requests failed.
    """
    valid = 0
    invalid = 0
    failed = 0
    for _ in range(sample_size):
        try:
            title = fetch_random_title()
            print('Random page title: {}'.format(title))
            pdf_bytes = fetch_pdf(title)
        except requests.RequestException as exc:
            failed += 1
            print('Request failed: {}'.format(exc))
            continue

        try:
            extract_text(io.BytesIO(pdf_bytes))
        except Exception as exc:
            # Any parser error marks the PDF as invalid.  ``Exception``
            # (rather than a bare ``except``) lets Ctrl-C and SystemExit
            # propagate instead of being counted as an invalid PDF.
            invalid += 1
            print('Invalid pdf: {} ({})'.format(title, type(exc).__name__))
        else:
            valid += 1
            print('Valid pdf: {}'.format(title))
    return valid, invalid, failed


if __name__ == '__main__':
    valid, invalid, failed = main()
    print('valid: {}'.format(valid))
    print('invalid: {}'.format(invalid))
    print('failed requests: {}'.format(failed))

# Results recorded in the original interactive session (20 samples), which
# did not check HTTP status codes, so its "Invalid" entries may include
# error responses as well as malformed PDFs:
#
#   John_McGrath_(Irish_footballer)                  Valid
#   Kishindih_District                               Invalid
#   High_Flux_Isotope_Reactor                        Valid
#   Fereej_Bin_Mahmoud                               Invalid
#   Ian_Gibson_(author)                              Valid
#   Hours,_Pyrénées-Atlantiques                      Valid
#   Geomancer_(disambiguation)                       Valid
#   Nelson_Mason                                     Valid
#   Service_star                                     Valid
#   One_Nite_Stand_(Of_Wolves_and_Sheep)             Valid
#   Ron_Prince                                       Valid
#   Gulczewo,_Greater_Poland_Voivodeship             Valid
#   Laski_Wielkie,_Kuyavian-Pomeranian_Voivodeship   Invalid
#   Coldsprings_Township,_Michigan                   Valid
#   Arthur_Rhodes                                    Valid
#   Lalpur_Upazila                                   Valid
#   Mesenchyme                                       Invalid
#   Carlos_Casares_Partido                           Valid
#   Salem_Hospital_(Oregon)                          Valid
#   Les_gitans                                       Valid
#
#   valid == 16, invalid == 4