Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F8938500
namespaceCalculations.py
James_Budday
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
James_Budday
Aug 3 2017, 11:54 AM
2017-08-03 11:54:13 (UTC+0)
Size
7 KB
Referenced Files
None
Subscribers
None
namespaceCalculations.py
View Options
#written for python 3
import
csv
import
sys
import
matplotlib.pyplot
as
plt
# ---------------------------------------------------------------------------
# Settings to modify when setting the script up for yourself: the input CSV
# and the project / language whose rows should be analysed.
# ---------------------------------------------------------------------------
#sourceFilename = "all(without all).csv"
sourceFilename = "all.csv"
project = "wikipedia"
lang = "en"

# Maps each CSV column name to its position inside a parsed row.
col = dict(
    project=0,
    lang=1,
    language=2,
    namespaces=3,
    numberOfSearches=4,
    proportion=5,
)

# Search counts keyed by namespace name and by "first,second" pair key.
namespaces = dict()
pairs = dict()
triples = dict()  # not implemented

# Running totals; the talk total starts as a float, the rest as ints.
totalSearches = totalNamespacesChosen = totalSearchesWithATalk = 0
totalNumberOfTalks = 0.0

# Tallies indexed by "number of namespaces chosen in a search".
# 35 is an arbitrary size, bigger than the number of namespaces used in any one
# search, so the list never needs resizing or bounds logic.
exactlyXNamespacesChosen = [0 for _ in range(35)]
# 22 slots: 20 namespaces is already very near all of them in some cases.
lessThanXNamespacesChosen = [0 for _ in range(22)]
moreThanXNamespacesChosen = [0 for _ in range(22)]
################################################################################################
# read from file and parse
################################################################################################
# Reads sourceFilename once.  Every row whose project and lang columns match
# the configured values has its search count folded into the module-level
# tallies: the totals, the three namespace-count lists, the per-namespace
# counts and the per-pair counts.  A csv parsing error ends the program with a
# message naming the file and line.
# NOTE(review): the nesting here was rebuilt from a listing that had lost its
# indentation.  The pair tally is assumed to run once per row, after every
# namespace of that row has been registered -- confirm against the original.
try:
    # The with-block closes the input even when parsing fails; newline='' is
    # how the csv module asks files to be opened so that line breaks inside
    # quoted fields are handled by the parser itself.
    with open(sourceFilename, newline='') as sourceFile:
        reader = csv.reader(sourceFile)
        if next(reader, None) is None:  # skip header row
            sys.exit('file %s is empty: no header row to skip' % sourceFilename)

        for row in reader:
            if not row:
                continue  # csv.reader yields [] for a blank line

            #to make language independent, remove the lang comparison below
            if row[col['project']] != project or row[col['lang']] != lang:
                continue

            # Parse the count once instead of on every use.
            searchCount = int(row[col['numberOfSearches']])
            namespaceCell = row[col['namespaces']]

            if ("talk" in namespaceCell) or ("Talk" in namespaceCell):
                totalNumberOfTalks += namespaceCell.lower().count("talk") * searchCount
                totalSearchesWithATalk += searchCount

            #split the cell about namespaces
            namespaceRow = namespaceCell.split(',')
            chosenCount = len(namespaceRow)

            totalSearches += searchCount
            totalNamespacesChosen += chosenCount * searchCount

            # exactly i chosen: only index chosenCount can match (indices 1..len-1)
            if 1 <= chosenCount < len(exactlyXNamespacesChosen):
                exactlyXNamespacesChosen[chosenCount] += searchCount
            # fewer than i chosen: every tracked i (starting at 2) above chosenCount
            for i in range(max(2, chosenCount + 1), len(lessThanXNamespacesChosen)):
                lessThanXNamespacesChosen[i] += searchCount
            # more than i chosen: every tracked i below chosenCount
            for i in range(min(chosenCount, len(moreThanXNamespacesChosen))):
                moreThanXNamespacesChosen[i] += searchCount

            #used to match pairs against this row
            namesInRow = []
            for currentNamespace in namespaceRow:
                currentNamespace = currentNamespace.strip()
                namesInRow.append(currentNamespace)

                #have I found this namespace before?
                if currentNamespace not in namespaces:
                    #make all pairs with the namespaces found so far
                    for name in namespaces:
                        pairs[currentNamespace + ',' + name] = 0
                    #triples are not implemented
                    #add to found
                    namespaces[currentNamespace] = 0

                #add to count singles
                namespaces[currentNamespace] += searchCount

            #add to count doubles: every known pair with both members in this row
            namesPresent = set(namesInRow)
            for pair in pairs:
                names = pair.split(',')
                if names[0] in namesPresent and names[1] in namesPresent:
                    pairs[pair] += searchCount
except csv.Error as e:
    sys.exit('file %s, line %d: %s' % (sourceFilename, reader.line_num, e))
################################################################################################
# write to file
################################################################################################
# Writes "<project>-<lang>-pairs.csv".  For every namespace, one row is written
# per pair key that contains it: the namespace, the pair key, the pair's search
# count, and that count divided by the namespace's own search count.
# The last column is headed 'Percent Chance' but holds that quotient as-is
# (it is not multiplied by 100).
writeFilename = project + '-' + lang + '-pairs.csv'
# newline='' leaves line endings to the csv writer, as the csv module documents.
with open(writeFilename, 'w', newline='') as givenFile:
    writer = csv.writer(givenFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Chosen Namespace', 'Pair That Would Be Created', 'Total Pairings', 'Percent Chance'])
    for name in namespaces:
        namespaceTotal = namespaces[name]
        for pair in pairs:
            # A pair key is "first,second"; keep only pairs that involve this namespace.
            if name not in pair.split(','):
                continue
            pairTotal = pairs[pair]
            # A namespace total of 0 (all of its rows had a search count of 0)
            # would divide by zero; report 0.0 for it instead of crashing.
            if namespaceTotal:
                chance = float(pairTotal) / float(namespaceTotal)
            else:
                chance = 0.0
            writer.writerow([name, pair, str(pairTotal), str(chance)])
################################################################################################
# output and graph
################################################################################################
# Prints the summary figures and shows a bar chart of how many searches chose
# exactly N namespaces.  The lines labelled "percentage" print the raw quotient
# (count / totalSearches), not a value multiplied by 100.


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 'n/a' when the denominator is 0.

    The totals are 0 when no row matched the project/lang filter (or no row
    contained a talk namespace); dividing then would crash the script after
    the pairs file has already been written.
    """
    if denominator == 0:
        return "n/a"
    return numerator / denominator


print("finished creating " + writeFilename + " in the same folder as this program")
print("\n-----------------------\n")
print("average number of talks per search given 1 talk is present: " + str(_ratio(totalNumberOfTalks, totalSearchesWithATalk)))
print("average number of talks per search in any search: " + str(_ratio(totalNumberOfTalks, totalSearches)))
print("average number of namespaces chosen in any search: " + str(_ratio(totalNamespacesChosen, totalSearches)))
print("\n------------------------\n")
print("percentage of searches with exactly 1 namespace chosen: " + str(_ratio(exactlyXNamespacesChosen[1], totalSearches)))
print("percentage of searches with less than 5 namespaces chosen: " + str(_ratio(lessThanXNamespacesChosen[5], totalSearches)))
print("percentage of searches with more than 20 namespaces chosen: " + str(_ratio(moreThanXNamespacesChosen[20], totalSearches)))

# for namespace, count in namespaces.items():
#     print(namespace)
#     print(count)

# One bar per index of exactlyXNamespacesChosen (index = namespaces chosen).
plt.bar(range(len(exactlyXNamespacesChosen)), exactlyXNamespacesChosen, align='center')
plt.title('Number of namespaces chosen in a given search')
plt.xlabel('Namespaces')
plt.ylabel('Searches count')
plt.show()

#unused (not very useful) graphs
# plt.bar(range(len(lessThanXNamespacesChosen)), lessThanXNamespacesChosen, align='center')
# plt.title('Less than x namespaces chosen in a given search')
# plt.xlabel('Less than x namespaces')
# plt.ylabel('Searches count')
# plt.show()
# plt.bar(range(len(moreThanXNamespacesChosen)), moreThanXNamespacesChosen, align='center')
# plt.title('More than x namespaces chosen in a given search')
# plt.xlabel('More than x namespaces')
# plt.ylabel('Searches count')
# plt.show()
# plt.bar(range(len(pairs)), pairs.values(), align='center')
# plt.xticks(range(len(pairs)), pairs.keys())
# plt.show()
File Metadata
Details
Attached
Mime Type
text/x-python
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
4818449
Default Alt Text
namespaceCalculations.py (7 KB)
Attached To
Mode
T165492: Find out which namespace combinations are used for searching
Attached
Detach File
Event Timeline
Log In to Comment