namespaceCalculations.py
James_Budday
Actions

Authored By

	James_Budday
	Aug 3 2017, 11:54 AM

Size

7 KB

Referenced Files

None

Subscribers

None

namespaceCalculations.py
View Options

	#written for python 3

	import csv
	import sys
	import matplotlib.pyplot as plt

	#settings to edit before running the script
	#(an alternative input file that was used before: "all(without all).csv")
	sourceFilename = "all.csv"
	project, lang = "wikipedia", "en"

	#lookup from CSV column name to that column's index inside a parsed row
	col = {
		columnName: position
		for position, columnName in enumerate(
			('project', 'lang', 'language', 'namespaces', 'numberOfSearches', 'proportion')
		)
	}

	#running tallies: searches per namespace name, and searches per "nameA,nameB" pair key
	namespaces = dict()
	pairs = dict()
	triples = dict() #not implemented

	#scalar totals; the talk total starts out as a float, the others as ints
	totalSearches = totalNamespacesChosen = totalSearchesWithATalk = 0
	totalNumberOfTalks = 0.0

	#search counts indexed by "number of namespaces chosen"
	#35 is an arbitrary size, bigger than the number of namespaces used in a search,
	#so that no logic is needed to keep the list the right size and zero-filled
	exactlyXNamespacesChosen = [0 for _ in range(35)]
	#20 namespaces is very near all for some namespaces, hence 22 slots
	lessThanXNamespacesChosen, moreThanXNamespacesChosen = [0 for _ in range(22)], [0 for _ in range(22)]


	################################################################################################
	# read from file and parse
	################################################################################################

	#Streams sourceFilename through csv.reader and, for every row whose project and lang
	#columns equal the configured values, updates the module-level tallies:
	#  totalSearches / totalNamespacesChosen      - searches, and namespaces summed over searches
	#  totalNumberOfTalks / totalSearchesWithATalk - occurrences of "talk" and searches containing one
	#  exactly/lessThan/moreThanXNamespacesChosen  - searches bucketed by how many namespaces they chose
	#  namespaces / pairs                          - searches per namespace and per "nameA,nameB" pair
	#A csv.Error aborts the script through sys.exit with the file name and line number.

	#newline='' is how the csv module asks for its files to be opened, and the with-block
	#closes the file once parsing ends (the handle used to be left open)
	with open(sourceFilename, newline='') as sourceFile:
		reader = csv.reader(sourceFile)

		try:
			#skip header row; the None default keeps an empty file from raising StopIteration
			next(reader, None)

			for row in reader:
				if not row:
					#csv.reader yields [] for a blank line, which has no columns to index
					continue

				#to make this language independent, drop the lang comparison from this test
				if row[col['project']] != project or row[col['lang']] != lang:
					continue

				#parse the two cells every tally below depends on exactly once
				searchCount = int(row[col['numberOfSearches']])
				namespaceCell = row[col['namespaces']]

				#the presence test and the count now share one case-insensitive match, so a
				#row can no longer be counted by one and missed by the other
				talkCount = namespaceCell.lower().count("talk")
				if talkCount:
					totalNumberOfTalks += talkCount * searchCount
					totalSearchesWithATalk += searchCount

				#split the cell about namespaces; namesInRow holds the stripped names
				namespaceRow = namespaceCell.split(',')
				namesInRow = [rawName.strip() for rawName in namespaceRow]
				chosenCount = len(namespaceRow)

				totalSearches += searchCount
				totalNamespacesChosen += chosenCount * searchCount

				#grow the histogram when a search chose more namespaces than it has slots
				#for, instead of silently leaving that search out
				shortfall = chosenCount + 1 - len(exactlyXNamespacesChosen)
				if shortfall > 0:
					exactlyXNamespacesChosen.extend([0] * shortfall)
				exactlyXNamespacesChosen[chosenCount] += searchCount

				#slot i (from 2 up) counts searches that chose fewer than i namespaces
				for i in range(max(2, chosenCount + 1), len(lessThanXNamespacesChosen)):
					lessThanXNamespacesChosen[i] += searchCount

				#slot i counts searches that chose more than i namespaces
				for i in range(min(chosenCount, len(moreThanXNamespacesChosen))):
					moreThanXNamespacesChosen[i] += searchCount

				for currentNamespace in namesInRow:
					if currentNamespace not in namespaces:
						#first sighting: open a zeroed pair counter against every namespace
						#found so far (triples are not implemented)
						for name in namespaces:
							pairs[currentNamespace + ',' + name] = 0
						namespaces[currentNamespace] = 0

					#add to count singles
					namespaces[currentNamespace] += searchCount

				#add to count doubles: a pair is credited when both of its names are in this
				#row; the set turns each membership test into a single lookup
				rowNames = set(namesInRow)
				for pair in pairs:
					names = pair.split(',')
					if names[0] in rowNames and names[1] in rowNames:
						pairs[pair] += searchCount

		except csv.Error as e:
			sys.exit('file %s, line %d: %s' % (sourceFilename, reader.line_num, e))

	################################################################################################
	# write to file
	################################################################################################

	#Output file name is derived from the configured project and language.
	writeFilename = '%s-%s-pairs.csv' % (project, lang)

	#For every namespace, emit one CSV line per pair that namespace takes part in. The last
	#column is the pair's count divided by the namespace's own count (a plain ratio, it is
	#not multiplied by 100).
	with open(writeFilename, 'w', newline='\n') as pairsFile:
		pairWriter = csv.writer(pairsFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
		pairWriter.writerow(['Chosen Namespace', 'Pair That Would Be Created', 'Total Pairings', 'Percent Chance'])

		for name, nameTotal in namespaces.items():
			for pair, pairTotal in pairs.items():
				#a pair key is "first,second"; skip the pairs this namespace is not part of
				if name not in pair.split(','):
					continue

				share = float(pairTotal) / float(nameTotal)
				pairWriter.writerow([name, str(pair), str(pairTotal), str(share)])


	################################################################################################
	# output and graph
	################################################################################################

	#Prints the summary statistics gathered while parsing, then shows a bar chart of how many
	#searches chose exactly x namespaces.

	def _safeRatio(numerator, denominator):
		"""Return numerator / denominator, or NaN when the denominator is zero.

		The totals used as denominators stay at zero when no row matched the configured
		project/lang (or when no matching row contained a talk namespace); without this
		guard the summary would stop with ZeroDivisionError instead of printing.
		"""
		if not denominator:
			return float('nan')
		return numerator / denominator

	print("finished creating " + writeFilename + " in the same folder as this program")
	print("\n-----------------------\n")

	print("average number of talks per search given 1 talk is present: " + str(_safeRatio(totalNumberOfTalks, totalSearchesWithATalk)))
	print("average number of talks per search in any search: " + str(_safeRatio(totalNumberOfTalks, totalSearches)))
	print("average number of namespaces chosen in any search: " + str(_safeRatio(totalNamespacesChosen, totalSearches)))
	print("\n------------------------\n")

	#NOTE: the labels say "percentage" but the printed values are plain ratios (count divided
	#by totalSearches); they are not multiplied by 100
	print("percentage of searches with exactly 1 namespace chosen: " + str(_safeRatio(exactlyXNamespacesChosen[1], totalSearches)))
	print("percentage of searches with less than 5 namespaces chosen: " + str(_safeRatio(lessThanXNamespacesChosen[5], totalSearches)))
	print("percentage of searches with more than 20 namespaces chosen: " + str(_safeRatio(moreThanXNamespacesChosen[20], totalSearches)))

	# for namespace, count in namespaces.items():
	# 	print(namespace)
	# 	print(count)

	#one bar per list index x: the number of searches that chose exactly x namespaces
	plt.bar(range(len(exactlyXNamespacesChosen)), exactlyXNamespacesChosen, align='center')
	plt.title('Number of namespaces chosen in a given search')
	plt.xlabel('Namespaces')
	plt.ylabel('Searches count')
	plt.show()


	#unused (not very useful) graphs

	# plt.bar(range(len(lessThanXNamespacesChosen)), lessThanXNamespacesChosen, align='center')
	# plt.title('Less than x namespaces chosen in a given search')
	# plt.xlabel('Less than x namespaces')
	# plt.ylabel('Searches count')
	# plt.show()

	# plt.bar(range(len(moreThanXNamespacesChosen)), moreThanXNamespacesChosen, align='center')
	# plt.title('More than x namespaces chosen in a given search')
	# plt.xlabel('More than x namespaces')
	# plt.ylabel('Searches count')
	# plt.show()


	# plt.bar(range(len(pairs)), pairs.values(), align='center')
	# plt.xticks(range(len(pairs)), pairs.keys())

	# plt.show()

File Metadata

Mime Type: text/x-python
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 4818449
Default Alt Text: namespaceCalculations.py (7 KB)

namespaceCalculations.pyJames_BuddayActions

namespaceCalculations.pyView Options

File Metadata

Event Timeline

namespaceCalculations.py
James_Budday
Actions

namespaceCalculations.py
View Options