Page MenuHomePhabricator
Paste P13257

More pdf stats
ActivePublic

Authored by akosiaris on Nov 10 2020, 1:19 PM.
Tags
Referenced Files
F33906846: More pdf stats
Nov 10 2020, 2:19 PM
F33904373: More pdf stats
Nov 10 2020, 1:58 PM
F33901611: More pdf stats
Nov 10 2020, 1:22 PM
F33901421: More pdf stats
Nov 10 2020, 1:19 PM
Subscribers
None
"""
Collect basic statistics on the PDFs Proton returns, sampled externally
through the public Wikipedia REST API (random pages on several wikis).
"""
import io
import pprint
import random
from urllib.parse import quote

import requests
from pdfminer.high_level import extract_text
# Language subdomains of wikipedia.org that are sampled.
WIKIS = ['en', 'es', 'el', 'it', 'fr', 'simple', 'de', 'ar', 'bg', 'no', 'tr']
# Number of random pages requested per run.
SAMPLES = 100
# Timeout (seconds) handed to every requests call, so that one stalled
# response cannot hang the whole run.
REQUEST_TIMEOUT = 120


def new_counters():
    """Return a fresh, zeroed set of counters for one stats bucket."""
    return {
        'cl-matches-bytes': 0,
        'cl-fails-bytes': 0,
        'valid': 0,
        'invalid': 0,
    }


def pdf_url(wiki, title):
    """Return the REST API URL that renders *title* on *wiki* as a PDF.

    The title is percent-encoded in full (including '/'), so titles such
    as 'AC/DC' or 'What?' stay a single path segment instead of being read
    as a sub-path or a query string.
    """
    return 'https://{}.wikipedia.org/api/rest_v1/page/pdf/{}'.format(
        wiki, quote(title, safe=''))


def content_length_matches(headers, body):
    """Tell whether the Content-Length header equals the size of *body*.

    A missing or non-numeric header counts as a mismatch rather than
    raising, so a single odd response cannot abort the run.
    """
    declared = headers.get('content-length')
    if declared is None:
        return False
    try:
        return int(declared) == len(body)
    except ValueError:
        return False


def collect_stats(samples=SAMPLES):
    """Fetch *samples* random pages as PDF and tally the results.

    Returns a dict of counter dicts. Buckets are keyed both by wiki code
    (str) and by HTTP status code (int); every sampled response increments
    one counter pair in each of its two buckets.
    """
    stats = {}
    for _ in range(samples):
        wiki = random.choice(WIKIS)
        if wiki not in stats:
            stats[wiki] = new_counters()

        try:
            r = requests.get(
                'https://{}.wikipedia.org/api/rest_v1/page/random/title'.format(wiki),
                timeout=REQUEST_TIMEOUT)
            # Fail here on an error response instead of with an opaque
            # KeyError/ValueError while digging into the JSON below.
            r.raise_for_status()
            title = r.json()['items'][0]['title']
            print('Random page title: {}'.format(title))
            url = pdf_url(wiki, title)
            pdf = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # No response to measure: report it and move on to the next sample.
            print('Request failed on {} wiki: {}'.format(wiki, err))
            continue

        code = pdf.status_code
        if code == 404:
            print('{} 404ed'.format(url))
        if code not in stats:
            stats[code] = new_counters()

        # NOTE(review): requests transparently decodes gzip/deflate bodies, so
        # a compressed response would be reported as a mismatch here - confirm
        # the PDF endpoint is served without Content-Encoding.
        if content_length_matches(pdf.headers, pdf.content):
            outcome = 'cl-matches-bytes'
        else:
            outcome = 'cl-fails-bytes'
        stats[code][outcome] += 1
        stats[wiki][outcome] += 1

        # A document counts as valid when pdfminer can extract its text
        # without raising; the extracted text itself is discarded.
        try:
            extract_text(io.BytesIO(pdf.content))
        except Exception:
            # Any parser error marks the PDF invalid; unlike a bare except
            # this still lets Ctrl-C (KeyboardInterrupt) stop the run.
            validity = 'invalid'
        else:
            validity = 'valid'
        stats[code][validity] += 1
        stats[wiki][validity] += 1
    return stats


if __name__ == '__main__':
    # NOTE(review): the paste lost its indentation; the summary is assumed to
    # be printed once, after all samples, not once per sample.
    stats = collect_stats()
    pprint.pprint(stats)