Page MenuHomePhabricator
Paste P15106

easy_scan_for_cross_wiki_joins.py
ActivePublic

Authored by Bstorm on Apr 1 2021, 7:15 PM.
Tags
None
Referenced Files
F34209966: easy_scan_for_cross_wiki_joins.py
Apr 1 2021, 7:15 PM
Subscribers
None
Tokens
"100" token, awarded by Jhernandez.
#!/usr/bin/env python3
import re
from typing import Set
from openpyxl import load_workbook
import sqlparse
db_patterns = ["centralauth", "meta_p"]
def _get_dbs_from_parsed_query_sqlparse(db, parsed_query, regex) -> Set[str]:
probably_db_identifiers = set()
probably_db_identifiers.add(db)
for token in parsed_query.tokens:
if isinstance(token, sqlparse.sql.Identifier) and "." in token.value:
if token.value.split(".")[0] in db_patterns or regex.match(
token.value.split(".")[0]
):
probably_db_identifiers.add(token.value.split(".")[0])
elif isinstance(token, sqlparse.sql.Identifier) and (
regex.match(token.value) or token.value in db_patterns
):
probably_db_identifiers.add(token.value)
# print(
# f"##### DB identifiers (sqlparse): {probably_db_identifiers}",
# )
return probably_db_identifiers
def is_cross_database(db, sql, regex):
parsed_queries = sqlparse.parse(sql, "utf-8")
for parsed_query in parsed_queries:
if len(_get_dbs_from_parsed_query_sqlparse(db, parsed_query, regex)) > 1:
return True
return False
def main():
wikidb = re.compile(r"[a-z]+wik.*_p")
print("hi there!")
crossqueries = 0
tally = 1
wb = load_workbook(filename="firehose1.xlsx")
sheet = wb["Sheet1"]
for row in sheet.iter_rows(
min_row=2, max_row=60008, min_col=1, max_col=4, values_only=True
):
# print(row)
try:
multi = is_cross_database(row[2], row[3], wikidb)
except Exception as e:
print(e)
print("oops")
print(f"On {tally}, where the query was {row[3]}")
tally += 1
if multi:
crossqueries += 1
print(f"I get {crossqueries} out of a total of {tally}")
if __name__ == "__main__":
main()