Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F28340947
raw.txt
No One
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
Addshore
Mar 7 2019, 10:57 AM
2019-03-07 10:57:52 (UTC+0)
Size
2 KB
Referenced Files
None
Subscribers
None
raw.txt
View Options
----------Cell 1----------------------------------------------------------------------
// Load the parquet snapshot at the given path as a DataFrame.
val df = spark.read
  .parquet("/user/joal/wmf/data/wmf/mediawiki/wikidata_parquet/20190204")

// Keep only the three term columns and drop down to an RDD of rows.
// The select order fixes the positional indices used when reading each row:
// 0 = labels, 1 = descriptions, 2 = aliases.
val base_rdd = df
  .select("labels", "descriptions", "aliases")
  .rdd
----------Cell 2----------------------------------------------------------------------
// String collections extracted from base_rdd. Columns are read by position:
// 0 = labels and 1 = descriptions (each read as Map[String, String]),
// 2 = aliases (read as Map[String, Seq[String]]). Only the map values are kept.
// NOTE(review): getMap presumably returns null for a null column, in which case
// `.values` would throw — confirm these columns are never null in the snapshot.

// Label strings only.
val s_l = base_rdd.flatMap(row => row.getMap[String, String](0).values)

// Description strings only.
val s_d = base_rdd.flatMap(row => row.getMap[String, String](1).values)

// Alias strings only; each map value is a list of aliases, flattened here.
val s_a = base_rdd.flatMap(row => row.getMap[String, Seq[String]](2).values.flatten)

// Labels followed by aliases.
val s_la = base_rdd.flatMap { row =>
  val labels = row.getMap[String, String](0).values
  val aliases = row.getMap[String, Seq[String]](2).values.flatten
  labels ++ aliases
}

// Labels, then descriptions, then aliases.
val s_lad = base_rdd.flatMap { row =>
  val labels = row.getMap[String, String](0).values
  val descriptions = row.getMap[String, String](1).values
  val aliases = row.getMap[String, Seq[String]](2).values.flatten
  labels ++ descriptions ++ aliases
}
----------Cell 3----------------------------------------------------------------------
// Collections to report on, in print order:
// labels+aliases+descriptions, labels+aliases, labels, descriptions, aliases.
val collections = Array(s_lad, s_la, s_l, s_d, s_a)

// For each collection, print byte and string-count statistics about duplication.
for (strings <- collections) {
  // One (utf8ByteLength, occurrences) pair per distinct string.
  // - Counts and lengths are Long so that length * occurrences cannot overflow Int
  //   for short strings repeated very many times.
  // - The charset is explicit so results do not depend on the executors' default encoding.
  // - Cached because six actions below reuse it; only two Longs per distinct string
  //   are kept, not the strings themselves.
  val sized_counts = strings
    .map(s => (s, 1L))
    .reduceByKey(_ + _)
    .map { case (s, n) => (s.getBytes(java.nio.charset.StandardCharsets.UTF_8).length.toLong, n) }
    .cache()

  val total_bytes = sized_counts.map { case (bytes, n) => bytes * n }.sum()
  val duplicate_bytes = sized_counts.map { case (bytes, n) => bytes * (n - 1) }.sum()

  // Triple check useful bytes for strings: the bytes of the distinct strings alone
  // must equal total minus duplicates. The result is reported instead of discarded.
  val unique_bytes = sized_counts.map(_._1).sum()
  if (unique_bytes != total_bytes - duplicate_bytes) {
    println(f"WARNING: unique bytes ($unique_bytes%.0f) != total - duplicate bytes (${total_bytes - duplicate_bytes}%.0f)")
  }

  // Total number of strings is the sum of the per-string occurrence counts,
  // which avoids rescanning the source collection. fold is safe on an empty RDD.
  val all_strings = sized_counts.map(_._2).fold(0L)(_ + _)
  val unique_strings = sized_counts.count()
  val oneoc_strings = sized_counts.filter(_._2 == 1L).count()

  println(f"-----------------------------------------------------")
  println(f"Total bytes for strings: $total_bytes%15.0f")
  println(f"Total duplicate bytes for strings: $duplicate_bytes%15.0f")
  println(f"Useful bytes for strings: ${total_bytes - duplicate_bytes}%15.0f")
  // The counts are Long, so they are formatted as integers (renders the same as %15.0f).
  println(f"Total strings: $all_strings%15d")
  println(f"Total unique strings: $unique_strings%15d")
  println(f"Total one occurrence strings: $oneoc_strings%15d")
  println(f"-----------------------------------------------------")

  // Release the cached pairs before moving on to the next collection.
  sized_counts.unpersist()
}
File Metadata
Details
Attached
Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
7160374
Default Alt Text
raw.txt (2 KB)
Attached To
Mode
P8168 (An Untitled Masterwork)
Attached
Detach File
Event Timeline
Log In to Comment