
parse.py

from dataknead import Knead
from pathlib import Path
import json
import re

import xmltodict

# Metadata fields to extract from every Europeana record
KEYS = [
    "dc:description",
    "dc:date",
    "dc:identifier",
    "dcterms:spatial",
    "dc:creator",
    "europeana:isShownAt",
    "europeana:type",
    "europeana:rights",
    "europeana:isShownBy"
]

# Number of records per output CSV file
BATCH_SIZE = 500

def load_xml(path):
    with open(path) as f:
        return xmltodict.parse(f.read())

def parse(d):
    ret = {
        "identifier": d["header"]["identifier"]
    }

    for key in KEYS:
        ret[key] = d["metadata"]["europeana:record"].get(key, None)

        # Sometimes items have multiple descriptions, in which case
        # xmltodict gives us a list
        if isinstance(ret[key], list):
            # Filter out items with value None
            ret[key] = [val for val in ret[key] if val is not None]

            # Join elements with a space
            ret[key] = " ".join(ret[key])

        # Clean up descriptions
        if key == "dc:description" and ret[key] is not None:
            # Remove all line endings
            ret[key] = ret[key].replace("\n", " ").replace("\r", "")

            # Collapse runs of spaces into a single space
            ret[key] = re.sub(" +", " ", ret[key])

    return ret

def write_json(path, data):
    with open(path, "w") as f:
        f.write(json.dumps(data, indent=4))

def main():
    results = []

    for path in Path(".").glob("download_data/*.xml"):
        data = load_xml(path)
        records = data["OAI-PMH"]["ListRecords"]["record"]

        # xmltodict returns a single dict instead of a list when a file
        # contains just one record, so normalize to a list
        if not isinstance(records, list):
            records = [records]

        records = [parse(r) for r in records]
        results = results + records

    # Split the combined results into chunks of BATCH_SIZE records
    chunks = [results[i:i + BATCH_SIZE] for i in range(0, len(results), BATCH_SIZE)]

    for index, chunk in enumerate(chunks):
        Knead(chunk).write(f"results-{index:05d}.csv")

if __name__ == "__main__":
    main()
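
# For reference, a minimal sketch of the per-record dictionary shape that
# parse() expects, inferred from the key lookups in the script above.
# All values here are invented placeholders, not real Europeana data.
sample_record = {
    "header": {"identifier": "oai:example/record-1"},
    "metadata": {
        "europeana:record": {
            "dc:description": ["A description.", "Another\ndescription."],
            "dc:date": "1900",
            "dc:identifier": "record-1",
            "dcterms:spatial": "Amsterdam",
            "dc:creator": "Unknown",
            "europeana:isShownAt": "https://example.org/record-1",
            "europeana:type": "IMAGE",
            "europeana:rights": "https://creativecommons.org/publicdomain/zero/1.0/",
            "europeana:isShownBy": "https://example.org/record-1.jpg"
        }
    }
}

# parse(sample_record) would join the two descriptions with a space, strip
# the embedded newline, and return a flat dictionary keyed by "identifier"
# plus every entry in KEYS.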
