Page MenuHomePhabricator
Paste P80117

Run threshold analysis for enwiki in stat8
ActivePublic

Authored by gkyziridis on Jul 28 2025, 2:41 PM.
Tags
None
Referenced Files
F65686362: Run threshold analysis for enwiki in stat8
Jul 28 2025, 2:46 PM
F65686343: Run threshold analysis for enwiki in stat8
Jul 28 2025, 2:41 PM
Subscribers
QUERY =
WITH revert_risks AS (
SELECT
rr.wiki_db AS wiki_db,
rr.rev_id AS revision_id,
rev_revert_risk AS revert_risk_score,
rr.rev_timestamp AS edit_timestamp,
mwh.event_user_text,
mwh.event_user_revision_count AS user_edit_count,
-- was it a newly created account. defined as user that created account 90 days before edit
CASE WHEN rru.user_id IS NOT NULL
AND unix_timestamp(rr.rev_timestamp) - unix_timestamp(rru.user_registration_timestamp) < 7776000
THEN TRUE
ELSE FALSE
END AS is_new_account,
-- was the user anonymous
CASE
WHEN event_user_is_anonymous THEN TRUE
ELSE FALSE
END AS is_anon,
CASE
WHEN SIZE(event_user_is_bot_by_historical) = 0 THEN FALSE
ELSE TRUE
END AS user_is_bot,
-- edit reverts another edit
revision_is_identity_revert AS is_revert,
rr.page_title,
page_namespace_is_content AS namespace_is_content,
-- was this edit reverted
CASE
WHEN rr.rev_is_identity_reverted THEN TRUE
ELSE FALSE
END AS is_reverted,
-- time to revert
rr.rev_seconds_to_identity_revert AS time_to_revert,
-- was the user extended confirmed
CASE
WHEN ARRAY_CONTAINS(event_user_groups, 'extendedconfirmed') THEN TRUE
ELSE FALSE
END AS is_extendedconfirmed,
CASE
WHEN bc.user_text IS NOT NULL THEN TRUE
ELSE FALSE
END AS is_blocked,
CASE
WHEN amr.amr_rev_parent_id IS NOT NULL THEN TRUE
ELSE FALSE
END AS was_reverted_by_automoderator
FROM
risk_observatory.revert_risk_predictions rr
JOIN
wmf.mediawiki_history mwh
ON rr.wiki_db = mwh.wiki_db
AND rr.rev_id = mwh.revision_id
LEFT JOIN wmf_product.automoderator_monitoring_snapshot_daily amr
ON rr.rev_id = amr.amr_rev_parent_id
-- was reverted by automoderator edits
AND is_amr_revert
AND amr.wiki_db = 'enwiki'
AND amr.snapshot_date = '2025-02-15'
LEFT JOIN event.mediawiki_user_blocks_change bc
ON rr.user_name = bc.user_text
AND rr.wiki_db = bc.`database`
AND bc.`database` = 'enwiki'
-- user blocked after edit
-- AND bc.meta.dt > rr.rev_timestamp
-- join to users dataset to identify new users
LEFT JOIN mneisler.canonical_revertrisk_users_2024 rru
ON mwh.event_user_id = rru.user_id
AND mwh.wiki_db = rru.wiki_db
WHERE
mwh.snapshot = '2024-11'
-- limit to indonesian Wikipedia
AND rr.wiki_db == 'enwiki'
-- limit to only revisions assigned revert risk score
AND rev_revert_risk IS NOT NULL
-- exclude adminstrators
AND
(
event_user_groups IS NULL
OR NOT ARRAY_CONTAINS(mwh.event_user_groups_historical, 'sysop')
)
AND event_timestamp > "2024-01-01 00:00:00"
AND event_timestamp < "2024-11-05 00:00:00"
),
excl_self_reverts AS (
SELECT
rd.*
FROM
revert_risks rd
JOIN
wmf.mediawiki_history mwh
ON rd.revision_id = mwh.revision_first_identity_reverting_revision_id
AND rd.wiki_db = mwh.wiki_db
WHERE
snapshot = '2024-11'
AND rd.is_revert
-- exclude self reverts
AND NOT rd.event_user_text = mwh.event_user_text
)
SELECT
*
FROM
revert_risks
WHERE
NOT revert_risks.is_revert
UNION ALL
SELECT
*
FROM
excl_self_reverts ;
============ - enwiki - ============
- Raw data shape: (0, 17)
- Duplicate rows found and removed: 0
- Clean data shape: (0, 17)
- Unique revision_ids: 0 | Data Shape: 0 | Same? : -> True
- Removing edits that are reverts from df | New Shape: (0, 17)
- Is any revert_risk_score NA? : False
- Is any user_edit_count NA? : False
- Is any time_to_revert NA? : False
Traceback (most recent call last):
File "/srv/home/gkyziridis/revert_risk_threshold_for_all_wikies.py", line 392, in <module>
rc = ROC(actual=actual, probs=predicted, plot=True, name=wiki_name)
File "/srv/home/gkyziridis/revert_risk_threshold_for_all_wikies.py", line 113, in ROC
fpr, tpr, thresholds = roc_curve(actual, probs)
File "/home/gkyziridis/.conda/envs/2025-05-08T10.39.42_gkyziridis/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
return func(*args, **kwargs)
File "/home/gkyziridis/.conda/envs/2025-05-08T10.39.42_gkyziridis/lib/python3.10/site-packages/sklearn/metrics/_ranking.py", line 1150, in roc_curve
fps, tps, thresholds = _binary_clf_curve(
File "/home/gkyziridis/.conda/envs/2025-05-08T10.39.42_gkyziridis/lib/python3.10/site-packages/sklearn/metrics/_ranking.py", line 835, in _binary_clf_curve
pos_label = _check_pos_label_consistency(pos_label, y_true)
File "/home/gkyziridis/.conda/envs/2025-05-08T10.39.42_gkyziridis/lib/python3.10/site-packages/sklearn/utils/validation.py", line 2641, in _check_pos_label_consistency
raise ValueError(
ValueError: y_true takes value in {} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.