
parse.py

from dataknead import Knead
from pathlib import Path
import json
import re

import xmltodict

# Metadata fields to extract from every Europeana record
KEYS = [
    "dc:description",
    "dc:date",
    "dc:identifier",
    "dcterms:spatial",
    "dc:creator",
    "europeana:isShownAt",
    "europeana:type",
    "europeana:rights",
    "europeana:isShownBy"
]

# Number of records per output CSV file
BATCH_SIZE = 500

def load_xml(path):
    with open(path) as f:
        return xmltodict.parse(f.read())

def parse(d):
    ret = {
        "identifier": d["header"]["identifier"]
    }

    for key in KEYS:
        ret[key] = d["metadata"]["europeana:record"].get(key, None)

        # Sometimes items have multiple descriptions, in which case
        # xmltodict gives us a list
        if isinstance(ret[key], list):
            # Filter out items with value None
            ret[key] = [val for val in ret[key] if val is not None]

            # Join elements with a space
            ret[key] = " ".join(ret[key])

        # Clean up descriptions
        if key == "dc:description" and ret[key] is not None:
            # Remove all line endings
            ret[key] = ret[key].replace("\n", " ").replace("\r", "")

            # Collapse runs of spaces into a single space
            ret[key] = re.sub(" +", " ", ret[key])

    return ret

def write_json(path, data):
    with open(path, "w") as f:
        f.write(json.dumps(data, indent=4))

def main():
    results = []

    for path in Path(".").glob("download_data/*.xml"):
        data = load_xml(path)
        records = data["OAI-PMH"]["ListRecords"]["record"]

        # xmltodict returns a single dict instead of a list when a file
        # contains just one record, so normalize to a list
        if not isinstance(records, list):
            records = [records]

        records = [parse(r) for r in records]
        results = results + records

    # Split the combined results into chunks of BATCH_SIZE records
    chunks = [results[i:i + BATCH_SIZE] for i in range(0, len(results), BATCH_SIZE)]

    for index, chunk in enumerate(chunks):
        Knead(chunk).write(f"results-{index:05d}.csv")

if __name__ == "__main__":
    main()
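
# For reference, a minimal sketch of the per-record dictionary shape that
# parse() expects, inferred from the key lookups in the script above.
# All values here are invented placeholders, not real Europeana data.
sample_record = {
    "header": {"identifier": "oai:example/record-1"},
    "metadata": {
        "europeana:record": {
            "dc:description": ["A description.", "Another\ndescription."],
            "dc:date": "1900",
            "dc:identifier": "record-1",
            "dcterms:spatial": "Amsterdam",
            "dc:creator": "Unknown",
            "europeana:isShownAt": "https://example.org/record-1",
            "europeana:type": "IMAGE",
            "europeana:rights": "https://creativecommons.org/publicdomain/zero/1.0/",
            "europeana:isShownBy": "https://example.org/record-1.jpg"
        }
    }
}

# parse(sample_record) would join the two descriptions with a space, strip
# the embedded newline, and return a flat dictionary keyed by "identifier"
# plus every entry in KEYS.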
