Page Menu · Home · Phabricator
Paste P14273

spaces-in-external-ids (T271126 investigation)
Active · Public

Authored by Lucas_Werkmeister_WMDE on Feb 9 2021, 3:48 PM.
#!/usr/bin/env python3
from SPARQLWrapper import SPARQLWrapper, JSON
# SPARQL endpoint URL that the wrapper below is pointed at.
endpoint_url = 'https://query.wikidata.org/sparql'
# Passed to SPARQLWrapper as its ``agent`` argument; names the task (T271126)
# and a contact address.
user_agent = 'T271126 analysis (lucas.werkmeister@wikimedia.de)'
# One wrapper instance, reused by get_external_id_property_ids() and
# get_counts(): each sets its own query on it before calling query().
sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
# Request JSON so the converted response can be indexed as
# ['results']['bindings'], which is how both functions read it.
sparql.setReturnFormat(JSON)
def get_external_id_property_ids():
    """Return the IDs of external-ID properties that have a ``wdt:P1630`` value.

    Sends a SPARQL query through the module-level ``sparql`` wrapper that
    selects every ``?property`` whose ``wikibase:propertyType`` is
    ``wikibase:ExternalId`` and for which a ``wdt:P1630`` statement (bound to
    ``?formatterUrl`` in the query) exists.

    Returns:
        A list of strings: each result's ``property`` value with the leading
        ``http://www.wikidata.org/entity/`` portion cut off.
    """
    entity_prefix = 'http://www.wikidata.org/entity/'
    sparql.setQuery('''
SELECT ?property WHERE {
?property wikibase:propertyType wikibase:ExternalId.
FILTER EXISTS { ?property wdt:P1630 ?formatterUrl. }
}
''')
    response = sparql.query().convert()

    property_ids = []
    for binding in response['results']['bindings']:
        entity_uri = binding['property']['value']
        # Drop the fixed-length URI prefix, keeping only what follows it.
        property_ids.append(entity_uri[len(entity_prefix):])
    return property_ids
def get_counts(property_id):
    """Count sampled values of a property and how many of them contain a space.

    Interpolates ``property_id`` into a SPARQL query (as ``wdt:<property_id>``)
    that uses the ``bd:sample`` service with ``bd:sample.limit 10000``, and
    runs it through the module-level ``sparql`` wrapper. The query sums 1 for
    every ``?id`` for which ``CONTAINS(?id, " ")`` holds (errors are coalesced
    to ``false``) and counts all rows.

    Args:
        property_id: Property identifier substituted after ``wdt:``.

    Returns:
        A ``(with_space, total)`` tuple of ints taken from the first result
        binding's ``withSpace`` and ``total`` values.
    """
    query_template = '''
SELECT (SUM(IF(COALESCE(CONTAINS(?id, " "), false), 1, 0)) AS ?withSpace) (COUNT(*) AS ?total) WHERE {
SERVICE bd:sample {
?subject wdt:%s ?id.
bd:serviceParam bd:sample.limit 10000
}
}
'''
    sparql.setQuery(query_template % property_id)
    # The query is a single aggregate, so only the first binding is read.
    row = sparql.query().convert()['results']['bindings'][0]
    with_space = int(row['withSpace']['value'])
    total = int(row['total']['value'])
    return with_space, total
def _ratio_then_total(item):
    """Sort key for ``(property_id, (with_space, total))``: ratio, then total."""
    counts = item[1]
    return (counts[0] / counts[1], counts[1])


# Tallies filled in while scanning every property.
properties_without_space = 0
properties_with_space = {}   # property ID -> (with_space, total)
properties_with_error = {}   # property ID -> exception raised by get_counts()

external_property_ids = get_external_id_property_ids()

# Wrap the ID list in a progress bar when the optional "progress" package can
# be imported; otherwise iterate over the plain list.
try:
    from progress.bar import IncrementalBar
    progress_bar = IncrementalBar('Running', suffix='%(index)d/%(max)d, %(eta_td)s remaining')
    property_ids = progress_bar.iter(external_property_ids)
except ImportError:
    property_ids = external_property_ids

for property_id in property_ids:
    try:
        with_space, total = get_counts(property_id)
    except Exception as error:
        # Best effort: remember the failure and carry on with the next
        # property; all failures are reported together afterwards.
        properties_with_error[property_id] = error
        continue
    if not with_space:
        properties_without_space += 1
        continue
    properties_with_space[property_id] = (with_space, total)

# Report: errors first, then properties with spaces, then the summary line.
if properties_with_error:
    print(f'Errors encountered with the following {len(properties_with_error)} properties:')
    for property_id, e in properties_with_error.items():
        print(property_id, e, sep='\n')

if properties_with_space:
    print(f'Spaces found in the following {len(properties_with_space)} properties:')
    # Ascending by share of sampled values containing a space, ties broken by
    # sample size.
    ranked = sorted(properties_with_space.items(), key=_ratio_then_total)
    for property_id, (with_space, total) in ranked:
        ratio = with_space / total
        print(f'{property_id:>5}: {ratio * 100:6.2f}% ({with_space:5}/{total:5})')

print(f'No spaces found in {properties_without_space} out of {properties_without_space + len(properties_with_space) + len(properties_with_error)} IDs.')