Page MenuHomePhabricator
Paste P8168

(An Untitled Masterwork)
ActivePublic

Authored by Addshore on Mar 7 2019, 10:03 AM.
Tags
None
Referenced Files
F28340947: raw.txt
Mar 7 2019, 10:57 AM
F28340671: raw.txt
Mar 7 2019, 10:12 AM
F28340637: raw.txt
Mar 7 2019, 10:03 AM
Subscribers
None
----------Cell 1----------------------------------------------------------------------
// Cell 1: load the Wikidata parquet snapshot and project the term columns.
val snapshot_path = "/user/joal/wmf/data/wmf/mediawiki/wikidata_parquet/20190204"
val df = spark.read.parquet(snapshot_path)
// Only these three columns are kept, in this order; the next cell reads them
// by position (0 = labels, 1 = descriptions, 2 = aliases).
val term_columns = df.select("labels", "descriptions", "aliases")
val base_rdd = term_columns.rdd
----------Cell 2----------------------------------------------------------------------
// Cell 2: flatten the term columns of every row into plain RDDs of strings.
// Column 0 (labels) and column 1 (descriptions) are read as Map[String, String];
// column 2 (aliases) is read as Map[String, Seq[String]] and flattened.

// All label strings.
val s_l = base_rdd.flatMap { row =>
  row.getMap[String, String](0).values
}

// All description strings.
val s_d = base_rdd.flatMap { row =>
  row.getMap[String, String](1).values
}

// All alias strings (each map value is a list of aliases).
val s_a = base_rdd.flatMap { row =>
  row.getMap[String, Seq[String]](2).values.flatten
}

// Labels followed by aliases, per row.
val s_la = base_rdd.flatMap { row =>
  val labels = row.getMap[String, String](0).values
  val aliases = row.getMap[String, Seq[String]](2).values.flatten
  labels ++ aliases
}

// Labels, then descriptions, then aliases, per row.
val s_lad = base_rdd.flatMap { row =>
  val labels = row.getMap[String, String](0).values
  val descriptions = row.getMap[String, String](1).values
  val aliases = row.getMap[String, Seq[String]](2).values.flatten
  labels ++ descriptions ++ aliases
}
----------Cell 3----------------------------------------------------------------------
// Cell 3: for each string collection, report how many bytes and strings it
// holds in total, how much of that is duplication, and how many strings are
// unique or occur exactly once.
import java.nio.charset.StandardCharsets

val collections = Array(s_lad, s_la, s_l, s_d, s_a)
for (strings <- collections) {
  // Occurrence count per distinct string. Counts are Long so that neither the
  // count itself nor the later bytes * count product can overflow an Int.
  val grouped_strings = strings.map(s => (s, 1L)).reduceByKey(_ + _)

  // Single pass over the distinct strings, accumulating:
  //   (total bytes, useful bytes, total strings, unique strings, one-occurrence strings)
  // "Useful" bytes count every distinct string once. Sizes are measured in
  // UTF-8 explicitly, so the result does not depend on the executors' default
  // JVM charset.
  val zero = (0L, 0L, 0L, 0L, 0L)
  val (total_bytes, useful_bytes, all_strings, unique_strings, oneoc_strings) =
    grouped_strings.aggregate(zero)(
      (acc, entry) => {
        val (text, occurrences) = entry
        val size = text.getBytes(StandardCharsets.UTF_8).length.toLong
        (
          acc._1 + size * occurrences,
          acc._2 + size,
          acc._3 + occurrences,
          acc._4 + 1L,
          acc._5 + (if (occurrences == 1L) 1L else 0L)
        )
      },
      (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3, a._4 + b._4, a._5 + b._5)
    )

  // Duplicate bytes are whatever is left once each distinct string has been
  // counted a single time, so total = useful + duplicate holds by construction.
  val duplicate_bytes = total_bytes - useful_bytes

  println(f"-----------------------------------------------------")
  println(f"Total bytes for strings: $total_bytes%15d")
  println(f"Total duplicate bytes for strings: $duplicate_bytes%15d")
  println(f"Useful bytes for strings: $useful_bytes%15d")
  println(f"Total strings: $all_strings%15d")
  println(f"Total unique strings: $unique_strings%15d")
  println(f"Total one occurrence strings: $oneoc_strings%15d")
  println(f"-----------------------------------------------------")
}