User:ChristieBot/Update historical GAs data.py


# This script tries to keep the historical_GA_reviews table up to date.
# It works as follows:
#   1. Get a set of GA pages -- usually ones created or moved since the last run.
#   2. Go through those and, if the data for them is already in the history
#      table, correct the reviewer and review_ts if necessary.
#   3. Look through them again and this time insert any records not in the
#      history table.
#   4. Set the "needs_analysis" flag.  This is usually used to determine which
#      records should be analysed and updated.
#   5. Use the where clause to determine which records will actually be analysed.
#   6. Loop through the list of records and try various ways to determine what
#      the values in the history table should be set to.

# Standard-library modules
import configparser
import datetime
import operator
import os
import re
import sys
import time

# Third-party modules
import pymysql
import pywikibot
from dateutil.parser import parse
from pywikibot.data.api import PropertyGenerator

# Make the local modules importable.
sys.path.append('./www/python/src')  # Not needed if run from that directory

# Local modules
from GA import Topic, Subtopic, Nom, Review_stats, WBGAN, Active_nomination, GAN, Name_changes, Nom_list
import GA_config
# import GA_config_test as GA_config   # swap in for testing
from GA_history import GAH, FailedGA, GAnominee, Article_History, GA_article_page, GARlink, GA_talk_page, GA_sub_page, GA_history_Exception, GAO

# ---- Config ----
# Read the Toolforge database credentials from $HOME/replica.my.cnf.
HOME = os.environ.get('HOME', '')  # get environment variable $HOME ('' if unset)
replica_path = HOME + '/replica.my.cnf'
if os.path.exists(replica_path):  # check that the file is found
    # Bug fix: ConfigParser must be instantiated -- the original assigned the
    # class itself, so config.read(...) would have raised a TypeError.
    config = configparser.ConfigParser()
    config.read(replica_path)
else:
    print('replica.my.cnf file not found')

# ---- Per-run setup: wiki site, database connection, and the query that
# selects the GA subpages to examine this run ----
site = pywikibot.Site('en', 'wikipedia')
database = "s55175__ganfilter"
conn = pymysql.connections.Connection(user=config['client']['user'],
                                      password=config['client']['password'],
                                      database="s55175__ganfilter",
                                      host='tools.db.svc.eqiad.wmflabs')
# Timestamp of the most recent review already recorded in the history table;
# used below to limit the queries to pages changed since the last run.
max_review_ts_str = GAH.get_max_review_ts_str(conn, config)

# When override_sql is True, get_rows_to_check uses the query in sql instead
# of its default selection.  Note: use underscores instead of spaces in page
# names, e.g.
#   sql = 'SELECT page_title FROM page p WHERE page_namespace=1 and page_title LIKE "Twerton_Park%/GA2"'
override_sql = True
sql = ""
# General query: GA subpages moved since the last run, union GA subpages
# created since the last run.  NOTE(review): this assignment and the one
# above are dead -- both are overwritten by the hardcoded query below, which
# looks like a debugging leftover; confirm which is intended before running.
sql = 'SELECT p.page_title FROM page p inner join logging_logindex l on p.page_id = l.log_page \
where log_type = "move" and page_namespace=1 AND page_title like "%/GA_" \
and l.log_timestamp > "' + max_review_ts_str + '" \
union \
SELECT page_title FROM page p inner join revision r on p.page_id = r.rev_page \
WHERE page_namespace=1 and r.rev_parent_id = 0 AND page_title LIKE "%/GA_" and rev_timestamp >= "' + max_review_ts_str + '" and rev_timestamp <= "2023-03-10"'
sql = "select p.page_title from page p where p.page_title like 'Sovetsky/GA%' and p.page_namespace = 1"

# find_incomplete_moves finds any moves that happened in the last 24 hours
# (usually) whose source page is in the historical database, and does two
# things: it sets the "needs_analysis" flag in the database, and it outputs
# a report to the incomplete-moves page for a human to review.
GAH.find_incomplete_moves(conn, config)

# get_rows_to_check will get all pages that have moved since max_review_ts,
# plus all pages that have been created since max_review_ts, by default.
# To override this, set override_sql = True and pass a query in sql, which
# will be used instead.
# rows_to_check is the list of GA subpages that need to be reviewed.
rows_to_check = GAH.get_rows_to_check(conn, config, max_review_ts_str, sql, override_sql)

# First check that the reviewer information is correct before we insert the
# rows into the historical database.  Any records in the historical database
# that don't agree with the creation date and creating editor of the review
# page will be updated.
GAH.check_reviewer_data(conn, config, rows_to_check)

# At this point the database has no incorrect data for the GA pages in
# rows_to_check; now insert into the historical database any record in
# rows_to_check that is not already there.
GAH.scan_for_new_pages(conn, config, rows_to_check)

# By default set_needs_analysis_flag sets the flag for all pages that have
# been moved, created, or edited since max_review_ts.  Pass sql as a query
# string returning a list of article titles to override; None keeps the
# default behaviour.
sql = None
need_analysis_count = GAH.set_needs_analysis_flag(conn, config, max_review_ts_str, sql)

# ---- Second half: analyse flagged records and fill in the history table ----
# This part can be run independently; it is what cleans up the history table.
# If the needs_analysis flag is set on exactly the records you want to
# update, just run with the default where clause.  Examples of narrower
# clauses:
#   where_clause = "where article_title collate utf8mb4_bin = 'Helene Scheu-Riesz' and page = 1"
#   where_clause = "where type is null"
conn = pymysql.connections.Connection(user=config['client']['user'],
                                      password=config['client']['password'],
                                      database="s55175__ganfilter",
                                      host='tools.db.svc.eqiad.wmflabs')
cursor = conn.cursor(pymysql.cursors.DictCursor)
where_clause = "where needs_analysis = 'Y'"
sql = "select article_title, page, review_ts, type, comments, outcome, outcome_ts, nominator, nomination_ts, reviewer, subtopic from " + GA_config.strings['historical GA reviews table name'] + " " + where_clause
# print(sql)
cursor.execute(sql)
ctr = 0
tdelta = datetime.timedelta(0, 60)  # one minute, used to start searching for revisions
# Bug fix: the original iterated over cursor.fetchall (the bound method
# object) instead of calling it; fetchall() returns the selected rows.
for sql_row in cursor.fetchall():
    row = sql_row
    print("Article: " + row['article_title'])
    subp = GA_sub_page(pywikibot.Page(site, "Talk:" + row['article_title'] + "/GA" + str(row['page'])))
    talkp = GA_talk_page(pywikibot.Page(site, "Talk:" + row['article_title']))
    articlep = GA_article_page(pywikibot.Page(site, row['article_title']))
    searchp = talkp
    # Tracks which columns have been determined for this record so far.
    has_been_set = {'type': False, 'nominator': False, 'nomination_ts': False,
                    'subtopic': False, 'outcome': False, 'outcome_ts': False,
                    'comments': False}
    subp.reset_attributes(conn)
    if subp.assess_state(conn, row, has_been_set, talkp, searchp, articlep, subp):
        # assess_state resolved the record; nothing more to analyse.
        subp.unset_needs_analysis_flag(conn)
    else:
        # Try each data source in turn; each call fills in whatever
        # attributes it can determine and records that in has_been_set.
        #print("Before ah, has_been_set is " + str(has_been_set))
        subp.try_article_history_update(conn, searchp, has_been_set, row)
        #print("Before GAN_page, has_been_set is " + str(has_been_set))
        subp.try_GAN_page_revisions(conn, talkp, has_been_set, row)
        #print("Before fga, has_been_set is " + str(has_been_set))
        subp.try_failed_GA_update(conn, talkp, has_been_set, row)
        #print("Before dga, has_been_set is " + str(has_been_set))
        subp.try_delisted_GA_update(conn, talkp, has_been_set, row)
        #print("Before wbgan, has_been_set is " + str(has_been_set))
        subp.try_WBGAN(conn, talkp, has_been_set, row, config)
        #print("Before GAR_headers, has_been_set is " + str(has_been_set))
        subp.check_for_GAR_headers(conn, row, has_been_set)
        #print("Before under_review, has_been_set is " + str(has_been_set))
        subp.check_for_under_review(conn, row, has_been_set)
        #print("Before tpr, has_been_set is " + str(has_been_set))
        subp.try_talk_page_revisions(conn, tdelta, searchp, has_been_set, row)
        #print("Before GA, has_been_set is " + str(has_been_set))
        subp.try_GA(conn, searchp, has_been_set, row)
        subp.unset_needs_analysis_flag(conn)
    ctr += 1
    if ctr % 10 == 0:
        print("Processed " + str(ctr) + " articles")