Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F34098594
spaces-in-external-ids (T271126 investigation)
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
Lucas_Werkmeister_WMDE
Feb 9 2021, 4:14 PM
2021-02-09 16:14:30 (UTC+0)
Size
2 KB
Referenced Files
None
Subscribers
None
spaces-in-external-ids (T271126 investigation)
View Options
#!/usr/bin/env python3
from
SPARQLWrapper
import
SPARQLWrapper
,
JSON
# SPARQL endpoint that every query in this script is sent to.
endpoint_url = 'https://query.wikidata.org/sparql'
# Passed to the client as its agent string so requests identify this analysis.
user_agent = 'T271126 analysis (lucas.werkmeister@wikimedia.de)'

# Single client shared by all query functions below; each of them overwrites
# the current query via setQuery() before running it. Results come back as JSON.
sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
sparql.setReturnFormat(JSON)
def get_external_id_property_ids():
    """Return the IDs of all external-ID properties that have a P1630 value.

    Runs a query on the module-level ``sparql`` client that selects every
    property whose ``wikibase:propertyType`` is ``wikibase:ExternalId`` and
    for which a ``wdt:P1630`` value (``?formatterUrl`` in the query) exists.

    Returns:
        A list of strings, one per result row, each being the property URI
        with the leading ``http://www.wikidata.org/entity/`` cut off.
    """
    # Every URI is sliced by the length of the entity prefix; the prefix
    # itself is not checked, exactly as a plain slice would behave.
    prefix_length = len('http://www.wikidata.org/entity/')
    property_query = '''
SELECT ?property WHERE {
?property wikibase:propertyType wikibase:ExternalId.
FILTER EXISTS { ?property wdt:P1630 ?formatterUrl. }
}
'''
    sparql.setQuery(property_query)
    response = sparql.query().convert()
    bindings = response['results']['bindings']
    uris = (binding['property']['value'] for binding in bindings)
    return [uri[prefix_length:] for uri in uris]
def get_counts(property_id):
    """Count how many sampled values of a property contain a space.

    The query asks the ``bd:sample`` service for up to 10000
    ``?subject wdt:<property_id> ?id`` matches and aggregates them into a
    single row.

    Args:
        property_id: Property ID interpolated after ``wdt:`` in the query.

    Returns:
        A ``(with_space, total)`` tuple of ints: the number of sampled values
        containing ``" "`` and the number of sampled values overall.

    Raises:
        IndexError: If the response contains no result row.
    """
    # The only placeholder in the template is the property ID after "wdt:".
    template = '''
SELECT (SUM(IF(COALESCE(CONTAINS(?id, " "), false), 1, 0)) AS ?withSpace) (COUNT(*) AS ?total) WHERE {
SERVICE bd:sample {
?subject wdt:%s ?id.
bd:serviceParam bd:sample.limit 10000
}
}
'''
    sparql.setQuery(template % property_id)
    bindings = sparql.query().convert()['results']['bindings']
    # Only the first result row is read.
    row = bindings[0]
    with_space = int(row['withSpace']['value'])
    total = int(row['total']['value'])
    return with_space, total
# Number of properties whose sample contained no value with a space.
properties_without_space = 0
# Property ID -> (values containing a space, sample size).
properties_with_space = {}
# Property ID -> exception raised while counting that property.
properties_with_error = {}

# Only the first 50 properties returned by the query are analysed.
external_property_ids = get_external_id_property_ids()[:50]

# Iterate the plain list by default; when the optional ``progress`` package
# can be imported, wrap the list in a progress bar instead.
property_ids = external_property_ids
try:
    from progress.bar import IncrementalBar
    progress_bar = IncrementalBar(
        'Running',
        suffix='%(index)d/%(max)d, %(eta_td)s remaining',
    )
    property_ids = progress_bar.iter(external_property_ids)
except ImportError:
    # No progress bar available: keep the plain list assigned above.
    pass
# Sort every property into one of three buckets: the count query failed, at
# least one sampled value contains a space, or no sampled value does.
for property_id in property_ids:
    try:
        with_space, total = get_counts(property_id)
    except Exception as error:
        # Best effort: remember the failure and move on to the next property.
        properties_with_error[property_id] = error
        continue
    if not with_space:
        properties_without_space += 1
        continue
    properties_with_space[property_id] = (with_space, total)
# List each property whose count query raised, followed by the exception.
if properties_with_error:
    error_count = len(properties_with_error)
    print(f'Errors encountered with the following {error_count} properties:')
    for failed_id, error in properties_with_error.items():
        print(failed_id)
        print(error)
# List the properties with spaces, ordered by the share of sampled values
# containing a space (ascending), with the sample size as tie-breaker.
if properties_with_space:
    print(f'Spaces found in the following {len(properties_with_space)} properties:')

    def _ratio_then_total(item):
        """Sort key for a (property ID, (with_space, total)) pair."""
        hits, sample_size = item[1]
        return (hits / sample_size, sample_size)

    ordered_items = sorted(properties_with_space.items(), key=_ratio_then_total)
    for property_id, (with_space, total) in ordered_items:
        ratio = with_space / total
        print(f'{property_id:>5}: {ratio*100:6.2f}% ({with_space:5} / {total:5})')
# Closing line: properties without spaces versus everything that was
# processed (without spaces + with spaces + failed).
processed_count = (
    properties_without_space
    + len(properties_with_space)
    + len(properties_with_error)
)
print(f'No spaces found in {properties_without_space} out of {processed_count} IDs.')
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
8887072
Default Alt Text
spaces-in-external-ids (T271126 investigation) (2 KB)
Attached To
Mode
P14273 spaces-in-external-ids (T271126 investigation)
Attached
Detach File
Event Timeline
Log In to Comment