#!/usr/bin/env python3 import re from typing import Set from openpyxl import load_workbook import sqlparse db_patterns = ["centralauth", "meta_p"] def _get_dbs_from_parsed_query_sqlparse(db, parsed_query, regex) -> Set[str]: probably_db_identifiers = set() probably_db_identifiers.add(db) for token in parsed_query.tokens: if isinstance(token, sqlparse.sql.Identifier) and "." in token.value: if token.value.split(".")[0] in db_patterns or regex.match( token.value.split(".")[0] ): probably_db_identifiers.add(token.value.split(".")[0]) elif isinstance(token, sqlparse.sql.Identifier) and ( regex.match(token.value) or token.value in db_patterns ): probably_db_identifiers.add(token.value) # print( # f"##### DB identifiers (sqlparse): {probably_db_identifiers}", # ) return probably_db_identifiers def is_cross_database(db, sql, regex): parsed_queries = sqlparse.parse(sql, "utf-8") for parsed_query in parsed_queries: if len(_get_dbs_from_parsed_query_sqlparse(db, parsed_query, regex)) > 1: return True return False def main(): wikidb = re.compile(r"[a-z]+wik.*_p") print("hi there!") crossqueries = 0 tally = 1 wb = load_workbook(filename="firehose1.xlsx") sheet = wb["Sheet1"] for row in sheet.iter_rows( min_row=2, max_row=60008, min_col=1, max_col=4, values_only=True ): # print(row) try: multi = is_cross_database(row[2], row[3], wikidb) except Exception as e: print(e) print("oops") print(f"On {tally}, where the query was {row[3]}") tally += 1 if multi: crossqueries += 1 print(f"I get {crossqueries} out of a total of {tally}") if __name__ == "__main__": main()