Page MenuHomePhabricator

namespaceCalculations.py

Authored By
James_Budday
Aug 3 2017, 11:54 AM
Size
7 KB
Referenced Files
None
Subscribers
None

namespaceCalculations.py

#written for python 3
import csv
import sys
import matplotlib.pyplot as plt
# ---------------------------------------------------------------------------
# Settings: these are the variables to modify to set the script up yourself.
# ---------------------------------------------------------------------------
# Input CSV; the commented-out line is an alternative export file.
#sourceFilename = "all(without all).csv"
sourceFilename = "all.csv"
# Only rows whose project and lang cells equal these values are tallied.
project = "wikipedia"
lang = "en"

# Column name -> index of that column within a parsed CSV row.
col = dict(
    project=0,
    lang=1,
    language=2,
    namespaces=3,
    numberOfSearches=4,
    proportion=5,
)

# ---------------------------------------------------------------------------
# Accumulators filled in while the CSV is read.
# ---------------------------------------------------------------------------
namespaces = dict()  # namespace name -> searches that included it
pairs = dict()       # "nameA,nameB" -> searches that included both
triples = dict()     # not implemented

totalSearches = 0
totalNamespacesChosen = 0
totalSearchesWithATalk = 0
totalNumberOfTalks = 0.0

# Index x holds the number of searches with exactly x namespaces chosen.
# 35 is an arbitrary size larger than the number of namespaces used in any
# search, so the list never needs to be grown or re-zeroed while counting.
exactlyXNamespacesChosen = [0 for _ in range(35)]
# Index x holds searches with fewer / more than x namespaces chosen.
# 20 namespaces is very near "all" for some projects, hence 22 slots.
lessThanXNamespacesChosen = [0 for _ in range(22)]
moreThanXNamespacesChosen = [0 for _ in range(22)]
################################################################################################
# read from file and parse
################################################################################################
# Stream the source CSV once and accumulate every statistic for the rows that
# match the configured project and language.
#
# The file is opened in a `with` block so the handle is always closed (also
# when sys.exit fires below), and with newline='' as the csv module asks for
# so that quoted cells containing line breaks are parsed correctly.
with open(sourceFilename, newline='') as sourceFile:
    reader = csv.reader(sourceFile)
    try:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for a blank line; there is nothing to tally.
            if not row:
                continue
            # To make the statistics language independent, drop the lang
            # comparison from this filter.
            if row[col['project']] != project or row[col['lang']] != lang:
                continue

            # Parse the per-row values once instead of on every use.
            searchCount = int(row[col['numberOfSearches']])
            namespaceCell = row[col['namespaces']]

            # Talk statistics: the gate accepts the spellings "talk"/"Talk";
            # occurrences are then counted case-insensitively as substrings
            # of the whole cell and weighted by the number of searches.
            if "talk" in namespaceCell or "Talk" in namespaceCell:
                totalNumberOfTalks += namespaceCell.lower().count("talk") * searchCount
                totalSearchesWithATalk += searchCount

            # Split the namespaces cell; namesInRow holds the trimmed names
            # and is used for the single and pair counts below.
            namespaceRow = namespaceCell.split(',')
            namesInRow = [rawName.strip() for rawName in namespaceRow]
            chosenCount = len(namespaceRow)

            totalSearches += searchCount
            totalNamespacesChosen += chosenCount * searchCount

            # Exactly-x histogram: rows with more namespaces than the list
            # has slots are left out, as index 0 is (split never returns an
            # empty list, so chosenCount is at least 1).
            if 1 <= chosenCount < len(exactlyXNamespacesChosen):
                exactlyXNamespacesChosen[chosenCount] += searchCount
            # Less-than-x: every slot x >= 2 with chosenCount < x.
            for x in range(max(2, chosenCount + 1), len(lessThanXNamespacesChosen)):
                lessThanXNamespacesChosen[x] += searchCount
            # More-than-x: every slot x with chosenCount > x.
            for x in range(min(chosenCount, len(moreThanXNamespacesChosen))):
                moreThanXNamespacesChosen[x] += searchCount

            # Count singles. The first time a namespace is seen, register a
            # zeroed pair entry between it and every namespace known so far,
            # then register the namespace itself.
            for currentNamespace in namesInRow:
                if currentNamespace not in namespaces:
                    for name in namespaces:
                        pairs[currentNamespace + ',' + name] = 0
                    # make all triples: not implemented
                    namespaces[currentNamespace] = 0
                namespaces[currentNamespace] += searchCount

            # Count doubles: a pair is credited when both of its namespaces
            # appear in this row.
            presentNames = set(namesInRow)
            for pair in pairs:
                names = pair.split(',')
                if names[0] in presentNames and names[1] in presentNames:
                    pairs[pair] += searchCount
    except csv.Error as e:
        sys.exit('file %s, line %d: %s' % (sourceFilename, reader.line_num, e))
################################################################################################
# write to file
################################################################################################
# Output file is named after the configured project and language.
writeFilename = '%s-%s-pairs.csv' % (project, lang)
with open(writeFilename, 'w', newline='\n') as givenFile:
    writer = csv.writer(
        givenFile,
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    headerRow = ['Chosen Namespace', 'Pair That Would Be Created', 'Total Pairings', 'Percent Chance']
    writer.writerow(headerRow)
    # For every namespace, emit one line per pair it takes part in.  The last
    # column is the pair's count divided by the namespace's own count, written
    # as a plain fraction.
    for name, nameTotal in namespaces.items():
        for pair, pairTotal in pairs.items():
            members = pair.split(',')
            if name not in (members[0], members[1]):
                continue
            chance = float(pairTotal) / float(nameTotal)
            writer.writerow([name, str(pair), str(pairTotal), str(chance)])
################################################################################################
# output and graph
################################################################################################
def _formatRatio(numerator, denominator):
    """Return str(numerator / denominator), or "n/a" when denominator is 0.

    The totals stay at zero when no CSV row matches the configured project
    and language (or when no matching row mentions a talk namespace), which
    would otherwise abort the report with ZeroDivisionError.
    """
    if denominator == 0:
        return "n/a"
    return str(numerator / denominator)


print("finished creating " + writeFilename + " in the same folder as this program")
print("\n-----------------------\n")
print("average number of talks per search given 1 talk is present: " + _formatRatio(totalNumberOfTalks, totalSearchesWithATalk))
print("average number of talks per search in any search: " + _formatRatio(totalNumberOfTalks, totalSearches))
print("average number of namespaces chosen in any search: " + _formatRatio(totalNamespacesChosen, totalSearches))
print("\n------------------------\n")
# NOTE: despite the "percentage" wording, the three values below are printed
# as plain ratios (count / totalSearches), not multiplied by 100.
print("percentage of searches with exactly 1 namespace chosen: " + _formatRatio(exactlyXNamespacesChosen[1], totalSearches))
print("percentage of searches with less than 5 namespaces chosen: " + _formatRatio(lessThanXNamespacesChosen[5], totalSearches))
print("percentage of searches with more than 20 namespaces chosen: " + _formatRatio(moreThanXNamespacesChosen[20], totalSearches))
# for namespace, count in namespaces.items():
# print(namespace)
# print(count)

# Bar chart: x is the number of namespaces chosen, y the number of searches
# that chose exactly that many.
plt.bar(range(len(exactlyXNamespacesChosen)), exactlyXNamespacesChosen, align='center')
plt.title('Number of namespaces chosen in a given search')
plt.xlabel('Namespaces')
plt.ylabel('Searches count')
plt.show()
#unused (not very useful) graphs
# plt.bar(range(len(lessThanXNamespacesChosen)), lessThanXNamespacesChosen, align='center')
# plt.title('Less than x namespaces chosen in a given search')
# plt.xlabel('Less than x namespaces')
# plt.ylabel('Searches count')
# plt.show()
# plt.bar(range(len(moreThanXNamespacesChosen)), moreThanXNamespacesChosen, align='center')
# plt.title('More than x namespaces chosen in a given search')
# plt.xlabel('More than x namespaces')
# plt.ylabel('Searches count')
# plt.show()
# plt.bar(range(len(pairs)), pairs.values(), align='center')
# plt.xticks(range(len(pairs)), pairs.keys())
# plt.show()

File Metadata

Mime Type
text/x-python
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
4818449
Default Alt Text
namespaceCalculations.py (7 KB)

Event Timeline