Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F31133346
parse.py
siebrand (Siebrand Mazeland)
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
siebrand
Nov 22 2019, 5:53 PM
2019-11-22 17:53:53 (UTC+0)
Size
1 KB
Referenced Files
None
Subscribers
None
parse.py
View Options
from
dataknead
import
Knead
from
pathlib
import
Path
import
json
import
xmltodict
import
re
# Metadata fields copied out of each "europeana:record" element, in the
# order they are added to every parsed record.
KEYS = [
    # Dublin Core fields
    "dc:description",
    "dc:date",
    "dc:identifier",
    "dcterms:spatial",
    "dc:creator",
    # Europeana-specific fields
    "europeana:isShownAt",
    "europeana:type",
    "europeana:rights",
    "europeana:isShownBy",
]

# Maximum number of parsed records written to a single output file.
BATCH_SIZE = 500
def load_xml(path):
    """Parse the XML file at *path* with xmltodict and return the result.

    The file is opened in binary mode and the file object is handed to
    ``xmltodict.parse`` directly, so the bytes reach the XML parser
    undecoded. Opening in text mode without an explicit encoding would
    decode with the platform's locale encoding first, which can garble
    non-ASCII metadata (or raise) on systems whose default is not UTF-8.

    Args:
        path: Location of the XML file (anything ``open`` accepts).

    Returns:
        The nested mapping produced by ``xmltodict.parse``.
    """
    with open(path, "rb") as f:
        return xmltodict.parse(f)
def parse(d, keys=None):
    """Flatten one harvested record into a flat dict of selected fields.

    Args:
        d: Record mapping. ``d["header"]["identifier"]`` is read directly,
            and ``d["metadata"]["europeana:record"]`` must support ``.get``.
        keys: Field names to copy out of the europeana record. When
            omitted, the module-level ``KEYS`` list is used.

    Returns:
        A dict holding ``"identifier"`` plus one entry per requested key:

        * a field absent from the record is stored as ``None``;
        * a list value has its ``None`` entries dropped and the rest
          joined with single spaces;
        * ``"dc:description"`` additionally has line endings removed and
          runs of spaces collapsed to one.

    Raises:
        KeyError: If the header identifier or the europeana record is
            missing from *d*.
    """
    if keys is None:
        keys = KEYS

    ret = {"identifier": d["header"]["identifier"]}

    for key in keys:
        value = d["metadata"]["europeana:record"].get(key)

        # Sometimes items have multiple descriptions...
        if isinstance(value, list):
            # Drop None entries with an explicit identity test. Filtering
            # through None.__ne__ yields NotImplemented for every other
            # value and depends on its truthiness, which newer Python
            # versions deprecate and then reject.
            value = ' '.join(item for item in value if item is not None)

        # Clean up descriptions
        if key == "dc:description" and value is not None:
            # Newlines become spaces, carriage returns are dropped.
            value = value.replace('\n', ' ').replace('\r', '')
            # Collapse the runs of spaces left behind into one.
            value = re.sub(' +', ' ', value)

        ret[key] = value

    return ret
def write_json(path, data):
    """Serialize *data* as JSON with 4-space indentation and save it to *path*.

    Any existing file at *path* is overwritten.

    Args:
        path: Destination file (anything ``open`` accepts).
        data: A JSON-serializable object.
    """
    with open(path, "w") as out:
        serialized = json.dumps(data, indent=4)
        out.write(serialized)
def main():
    """Convert the XML files under ``download_data/`` into batched CSV files.

    Every ``download_data/*.xml`` file is loaded with ``load_xml``, each
    entry under ``OAI-PMH > ListRecords > record`` is flattened with
    ``parse``, and the combined records are written through ``Knead`` in
    chunks of ``BATCH_SIZE`` to ``results-00000.csv``, ``results-00001.csv``
    and so on, in the current working directory.
    """
    results = []

    # glob() yields paths in filesystem-dependent order; sorting keeps the
    # contents of each output batch reproducible between runs.
    for path in sorted(Path(".").glob("download_data/*.xml")):
        data = load_xml(path)
        records = data["OAI-PMH"]["ListRecords"]["record"]

        # xmltodict returns a single mapping instead of a list when an
        # element occurs only once; wrap it so the record itself is parsed
        # rather than its keys being iterated.
        if not isinstance(records, list):
            records = [records]

        # Extend in place instead of rebuilding the whole list per file.
        results.extend(parse(r) for r in records)

    for index, start in enumerate(range(0, len(results), BATCH_SIZE)):
        chunk = results[start:start + BATCH_SIZE]
        Knead(chunk).write(f"results-{str(index).zfill(5)}.csv")


if __name__ == "__main__":
    main()
File Metadata
Details
Attached
Mime Type
text/x-python
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
8231918
Default Alt Text
parse.py (1 KB)
Attached To
Mode
T235995: Upload photos from Elsinga collection at the Alkmaar archives (+/- 10,000 images)
Attached
Detach File
Event Timeline
Log In to Comment