# Source: User:ChristieBot/GA history.py (English Wikipedia)

''' Copyright (c) 2022 Mike Christie Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. '''

import datetime
import operator
import re
import sys
import time
import urllib.parse

import pymysql
import pywikibot
from dateutil.parser import parse

sys.path.append('./www/python/src')  # Not needed if I run from that directory

import GA_config
from GA import Topic, Subtopic, Nom, Review_stats, WBGAN, Active_nomination, GAN, Name_changes, Nom_list

class GAH:

@classmethod
def update_historical_GA_reviews(cls, conn, row, fields, has_been_set):
    """Update one row of the historical GA reviews table.

    fields maps column name -> new value; has_been_set is a dictionary of
    true/false flags, per column, tracking whether a value has already been
    written so it is not overwritten by a later (less reliable) source.
    Returns the number of records updated (0 on error or nothing to do).
    """
    valid_fields = ['comments', 'nomination_ts', 'nominator', 'outcome', 'outcome_ts', 'subtopic', 'type']
    sql_base = "update " + GA_config.strings['historical GA reviews table name'] + " "
    sql_sets = ""
    if fields == {} or fields is None:
        # Nothing to do
        return 0
    # BUG FIX: `fields.keys` / `for k in fields.keys` were missing the call
    # parentheses; membership and iteration now use the dict directly.
    if 'nomination_ts' in fields:
        fields['nomination_ts'] = GAH.convert_timestamp_to_datetime(fields['nomination_ts'])
    for k in fields:
        if not has_been_set[k]:
            # This loop assembles part of the update statement, depending on
            # whether the value is int, string, or null
            if k not in valid_fields:
                GAN.notify_error(conn, "GAH.update_historical_GA_reviews", row['article_title'], "Invalid field " + str(k) + " supplied")
                return 0
            else:
                if fields[k] is not None and fields[k] != 'null':
                    has_been_set[k] = True
                else:
                    has_been_set[k] = False
                if sql_sets == "":
                    sql_sets = "set "
                else:
                    sql_sets += ", "
                if isinstance(fields[k], int):
                    sql_sets += k + " = '" + str(fields[k]) + "'"
                elif fields[k] is None or fields[k] == 'null':
                    sql_sets += k + " = null"
                elif isinstance(fields[k], str):
                    # BUG FIX: .strip was not called (concatenated a method repr)
                    sql_sets += k + " = '" + fields[k].replace("'", "''").strip() + "'"
                else:
                    sql_sets += k + " = '" + str(fields[k]) + "'"
    if sql_sets == "":
        return 0
    sql_where = (" where article_title collate utf8mb4_bin = '" + row['article_title'].replace("'", "''")
                 + "' and page = " + str(row['page']) + " and review_ts = '" + str(row['review_ts']) + "'")
    sql = sql_base + sql_sets + sql_where
    GAN.log(conn, "GAH.update_historical_GA_reviews", row['article_title'], sql)
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    try:
        records_updated = cursor.execute(sql)
    except pymysql.err.ProgrammingError:
        conn.rollback()  # BUG FIX: rollback/commit below were referenced, never called
        GAN.notify_error(conn, "GAH.update_historical_GA_reviews", row['article_title'], sql)
        return 0
    if records_updated > 1:
        conn.rollback()
        GAN.notify_error(conn, "GAH.update_historical_GA_reviews", row['article_title'], "More than one record updated! by <" + sql + ">")
    elif records_updated == 1:
        conn.commit()
    return records_updated

@classmethod
def get_gar_dict(cls, conn, config):
    """Build a dict of subtopic -> list of GAR nominee entries.

    Each entry is [article_title, {'GARpage','orignom','GARdate','shortdesc'}],
    built from the 'Good article reassessment nominees' category plus data from
    the historical_GA_reviews table and the talk-page templates.
    """
    site = pywikibot.Site('en', 'wikipedia')
    cat_articles = []
    cat = pywikibot.Category(site, 'Good article reassessment nominees')
    # We're going to iterate over these; this is the contents of GAN.
    # BUG FIX: Category.articles and Page.title are methods and were not called.
    for x in cat.articles():
        title = x.title()[5:]  # strip the leading "Talk:"
        if title not in cat_articles:
            cat_articles.append(title)

    gar_dict = {}
    for GAR_nominee in cat_articles:
        subtopic = 'Miscellaneous'
        GAR_tp = pywikibot.Page(site, "Talk:" + GAR_nominee)
        # NOTE(review): the search pattern literal was lost in transcription;
        # reconstructed as the {{GAR/link}} template -- confirm against the original.
        gar = GAH.get_template_list_from_text(GAR_tp.text, r"\{\{GAR/link")
        nominee_dict = [GAR_nominee, {'GARpage': None, 'orignom': None, 'GARdate': None, 'shortdesc': None}]
        if gar:  # BUG FIX: `is not None` alone would let an empty list raise IndexError below
            garlink = GARlink(gar[0], GAR_tp.latest_revision_id, GAR_tp)
            nominee_dict = [GAR_nominee, {'GARpage': garlink.GAR_page_num, 'orignom': None, 'GARdate': garlink.date, 'shortdesc': garlink.shortdesc}]
        gar_sql = "select nominator from historical_GA_reviews where article_title = '" + GAR_nominee.replace("'", "") + "' and page = (select max(page) from historical_GA_reviews where article_title = '" + GAR_nominee.replace("'", "") + "' and type = 'GAN')"
        gar_cursor = conn.cursor(pymysql.cursors.DictCursor)
        gar_nom_count = gar_cursor.execute(gar_sql)
        if gar_nom_count > 0:
            gar_row = gar_cursor.fetchone()  # BUG FIX: fetchone was not called
            gar_nominee = gar_row['nominator']
            nominee_dict[1]['orignom'] = gar_nominee
        aht = GAN.get_article_history_template(GAR_tp.text)
        if aht is not None:
            ah = Article_History(conn, aht, GAR_tp.latest_revision_id, GAR_tp)
            if ah.topic is not None and ah.topic != '':
                subtopic = ah.topic
        else:
            # NOTE(review): pattern reconstructed as the {{GA|...}} template -- confirm.
            gat = GAH.get_template_list_from_text(GAR_tp.text, r"\{\{GA\|")
            if gat is not None and gat != []:
                gao = GAO(gat[0], GAR_tp.latest_revision_id, GAR_tp)
                if gao.subtopic is not None and gao.subtopic != '':
                    subtopic = gao.subtopic
                elif gao.topic is not None and gao.topic != '':
                    subtopic = gao.topic
        # BUG FIX: .lower and .keys were not called
        if subtopic.lower() in Subtopic.subtopic_var_dict:
            subtopic = Subtopic.subtopic_var_dict[subtopic.lower()]
        if subtopic in gar_dict:
            gar_dict[subtopic].append(nominee_dict)
        else:
            gar_dict[subtopic] = [nominee_dict]
    return gar_dict

@classmethod def convert_datetime_to_YMDHMS(cls, dt): dt_str = str(dt.year) + ('0' + str(dt.month))[-2:] + ('0' + str(dt.day))[-2:] + ('0' + str(dt.hour))[-2:] + ('0' + str(dt.minute))[-2:] + ('0' + str(dt.second))[-2:] return(dt_str)

@classmethod def convert_timestamp_to_datetime(cls, ts): if ts is None: #print("Couldn't convert") return None else: if type(ts) is datetime.datetime: cts = ts           elif type(ts) is datetime.date: cts = datetime.datetime(year = ts.year, month = ts.month, day = ts.day) else: cts = datetime.datetime(year = ts.year, month = ts.month, day = ts.day, hour = ts.hour, minute = ts.minute, second = ts.second) #print("Converted timestamp " + str(ts) + " to " + str(cts)) return(cts.replace(tzinfo=None))

@classmethod
def sub_page_missing(cls, conn, article_title, page_num):
    """Return True when no historical-review row exists for article_title/page_num.

    page_num must be a single-digit string; anything else returns False
    (this filters out subpages named e.g. "../GAR").
    """
    if page_num not in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']:
        return False
    sql = "select article_title, page from " + GA_config.strings['historical GA reviews table name'] + " where article_title collate utf8mb4_bin = '" + article_title.replace("'", "''") + "' and page = " + str(page_num)
    GAN.log(conn, "sub_page_missing", article_title, "sql is " + sql)
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    count = cursor.execute(sql)
    cursor.close()  # BUG FIX: close was referenced, not called
    return count == 0

@classmethod def get_latest_non_bot_revision_timestamp(cls, sub_page): #TODO change to a method of GA_sub_page last_non_bot_user = sub_page.lastNonBotUser latest_revision_timestamp = sub_page.latest_revision.timestamp for sub_rev in sub_page.revisions: if sub_rev['user'] == last_non_bot_user: latest_revision_timestamp = sub_rev['timestamp'] break return(latest_revision_timestamp)

@classmethod
def find_incomplete_moves(cls, conn, config):
    """Find articles moved in the previous UTC day whose GA subpages were not
    moved with them, and flag their historical rows for re-analysis.
    """
    # TODO change the timestamp to use previous UTC day midnight to midnight
    #      if utcnow is between UTC midnight and 00:20.
    # TODO ignore moves from any namespace other than 0
    now = datetime.datetime.utcnow()  # BUG FIX: utcnow was referenced, not called
    # The next two lines avoid this code running over and over; they only allow
    # execution in the first few minutes after midnight
    #if now.hour != 0 or now.minute > 19:
    #    return(None)
    prev_midnight = now.replace(hour=0, minute=0, second=0, microsecond=0)
    prev_midnight_minus_a_day = prev_midnight - datetime.timedelta(days=1)
    prev_midnight = GAH.convert_datetime_to_YMDHMS(prev_midnight)
    prev_midnight_minus_a_day = GAH.convert_datetime_to_YMDHMS(prev_midnight_minus_a_day)
    site = pywikibot.Site("en", "wikipedia")
    wpDatabase = "enwiki_p"
    wp_conn = pymysql.connections.Connection(user=config['client']['user'], password=config['client']['password'], database=wpDatabase, host='enwiki.analytics.db.svc.wikimedia.cloud')
    # Find all pages that were moved between "prev_midnight_minus_a_day" and "prev_midnight"
    # This may need to be updated to remove round-robin moves, which use the Draft namespace.
    wp_sql = "SELECT l.log_title as source_title, p.page_title as target_title, l.log_timestamp as move_timestamp FROM page p inner join logging_logindex l on p.page_id = l.log_page "
    wp_sql += "where log_type = 'move'and page_namespace = 1 "
    wp_sql += "and l.log_timestamp between '" + str(prev_midnight_minus_a_day) + "' and '" + str(prev_midnight) + "' order by l.log_title"
    wp_cursor = wp_conn.cursor(pymysql.cursors.DictCursor)
    move_count = wp_cursor.execute(wp_sql)
    incomplete_moves = []
    for row in wp_cursor.fetchall():  # BUG FIX: fetchall was not called
        source_article_name = row['source_title'].decode("UTF8").replace("_", " ")
        target_article_name = row['target_title'].decode("UTF8").replace("_", " ")
        move_timestamp = row['move_timestamp'].decode("UTF8")
        # Find every article in the historical GA reviews table that has been
        # moved according to the query above.
        # TODO This needs to restrict the moves to ones that happened after the GA
        # nomination but the historical GA reviews entries need to be fixed first.
        sql = "select article_title, page from " + GA_config.strings['historical GA reviews table name'] + " where article_title collate utf8mb4_bin = '" + source_article_name.replace("'", "''") + "'"
        cursor2 = conn.cursor(pymysql.cursors.DictCursor)
        num_sub_pages = cursor2.execute(sql)
        GAN.log(conn, "find_incomplete_moves: finding", None, sql + " returned " + str(num_sub_pages) + " sub pages")
        if num_sub_pages > 0:
            GAN.log(conn, "find_incomplete_moves: finding", None, "num_sub_page > 0")
            sub_page_move_list = []
            for row2 in cursor2.fetchall():  # BUG FIX: fetchall was not called
                GAN.log(conn, "find_incomplete_moves: finding", source_article_name, "in loop for sub pages")
                sub_page_page_num = row2['page']
                sub_page_title_to_check = "Talk:" + source_article_name + "/GA" + str(sub_page_page_num)
                sub_page_to_check = pywikibot.Page(site, sub_page_title_to_check)
                if sub_page_to_check.exists():  # BUG FIX: exists was not called
                    if not GAN.is_redirect(sub_page_to_check):
                        sub_page_move_target = "Talk:" + target_article_name + "/GA" + str(sub_page_page_num)
                        sub_page_move_list.append([sub_page_title_to_check, sub_page_move_target])
            if sub_page_move_list != []:
                GAN.log(conn, "find_incomplete_moves: finding", source_article_name, "Adding to incomplete moves list")
                incomplete_moves.append([source_article_name, target_article_name, sub_page_move_list])
                sql = "update " + GA_config.strings['historical GA reviews table name'] + " set needs_analysis = 'Y' where article_title = '" + source_article_name.replace("'", "''") + "'"
                cursor3 = conn.cursor(pymysql.cursors.DictCursor)
                cursor3.execute(sql)
                conn.commit()  # BUG FIX: commit was referenced, not called
    # The following code is commented out because Aidan9382-Bot is now
    # automatically fixing incomplete moves. I am leaving the code in place in
    # case it is needed in the future, e.g. if that bot stops running.
    #if len(incomplete_moves) > 0:
    #    temp_list = []
    #    # Get rid of duplicate entries
    #    for i in range(0, len(incomplete_moves) - 1):
    #        if incomplete_moves[i] != incomplete_moves[i + 1]:
    #            temp_list.append(incomplete_moves[i])
    #    temp_list.append(incomplete_moves[len(incomplete_moves) - 1])
    #    incomplete_moves = temp_list
    #    move_error_page = pywikibot.Page(site, GA_config.strings['Moved pages that did not move GA subpages'])
    #    error_text = move_error_page.text
    #    # TODO change the timestamp on the header to show the date range covered
    #    error_text += "\n\n==" + str(datetime.datetime.utcnow())[:19] + "==\n"
    #    write_flag = False
    #    for i in incomplete_moves:
    #        GAN.log(conn, "find_incomplete_moves: reporting", None, "checking incomplete moves for ones to report")
    #        err_flag = False
    #        if str(i[0]) != str(i[1]):
    #            temp_text = "* " + str(i[0]) + " -> " + str(i[1]) + "\n"
    #            for s in i[2]:
    #                if str(s[0]) != str(s[1]):
    #                    temp_text += "** " + str(s[0]) + " -> " + str(s[1]) + "\n"
    #                    err_flag = True
    #                    write_flag = True
    #            if err_flag:
    #                error_text += temp_text
    #    if write_flag:
    #        move_error_page.text = error_text
    #        move_error_page.save("Updated list of incomplete moves")

@classmethod
def get_max_review_ts_str(cls, conn, config):
    """Return the latest review_ts in the historical table as YYYYMMDDHH0000.

    Minutes/seconds are zeroed so callers can use it as an on-the-hour lower
    bound for wiki-replica range queries.
    """
    max_ts_sql = "select max(review_ts) as max_review_ts from " + GA_config.strings['historical GA reviews table name']
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    cursor.execute(max_ts_sql)
    ts_row = cursor.fetchone()  # BUG FIX: fetchone was referenced, not called
    max_review_ts = ts_row['max_review_ts']
    # NOTE(review): assumes the table is non-empty (max() not NULL) -- confirm
    return max_review_ts.strftime("%Y%m%d%H") + "0000"

@classmethod
def set_needs_analysis_flag(cls, conn, config, max_review_ts_str, sql):
    """Set needs_analysis = 'Y' on historical rows whose GA subpages changed.

    Uses the three-way union query below (moved / created / modified since
    max_review_ts_str) unless an override query is passed in sql.
    Returns the number of subpages found on the wiki replica.
    """
    u_conn = pymysql.connections.Connection(user=config['client']['user'], password=config['client']['password'], database="s55175__ganfilter", host='tools.db.svc.eqiad.wmflabs')
    cursor = u_conn.cursor(pymysql.cursors.DictCursor)
    wpDatabase = "enwiki_p"
    wp_conn = pymysql.connections.Connection(user=config['client']['user'], password=config['client']['password'], database=wpDatabase, host='enwiki.analytics.db.svc.wikimedia.cloud')
    # The following SQL unions three queries. All pages with a name ending in
    # /GAn where 1. the page has been moved since max_review_ts (mrt) or
    # 2. the page was created since max_review_ts or 3. the page has been
    # modified since max_review_ts.
    if sql is None:
        wp_sql = 'SELECT p.page_title FROM page p '
        wp_sql += 'inner join logging_logindex l on p.page_id = l.log_page '
        wp_sql += 'where log_type = "move" '
        wp_sql += 'and page_namespace =1 AND page_title like "%/GA_" '
        wp_sql += 'and l.log_timestamp > "' + max_review_ts_str + '" '
        wp_sql += 'union '
        wp_sql += 'SELECT page_title FROM page p inner join revision r on p.page_id = r.rev_page '
        wp_sql += 'WHERE page_namespace=1 and r.rev_parent_id = 0 AND page_title LIKE "%/GA_" and rev_timestamp >= "' + max_review_ts_str + '" '
        wp_sql += 'union '
        wp_sql += 'select page_title from page p inner join revision r '
        wp_sql += 'on p.page_latest = r.rev_id AND p.page_title LIKE "%/GA_" and p.page_namespace = 1 and r.rev_timestamp > "' + max_review_ts_str + '"'
    else:
        wp_sql = sql
    wp_cursor = wp_conn.cursor(pymysql.cursors.DictCursor)
    wp_count = wp_cursor.execute(wp_sql)
    for row in wp_cursor.fetchall():  # BUG FIX: fetchall was not called
        # Set needs_analysis to "Y". Note that if this query found records that
        # were not inserted into the history table there will be no update.
        sql = "update " + GA_config.strings['historical GA reviews table name'] + " set needs_analysis = 'Y' where article_title = '" + row['page_title'].decode("UTF8").replace("_", " ").replace("'", "''")[:-4] + "'"
        update_count = cursor.execute(sql)
    u_conn.commit()  # BUG FIX: commit was referenced, not called
    return wp_count

@classmethod
def get_rows_to_check(cls, conn, config, max_review_ts_str, sql, override_sql):
    """Return wiki-replica rows for GA subpages moved or created since max_review_ts_str.

    If override_sql is True, the query in "sql" is used instead. This can be
    useful when you want to extract a different list for some reason -- perhaps
    to reprocess those pages, or to do a subset for debugging, or to break up a
    large run.
    """
    wpDatabase = "enwiki_p"
    wp_conn = pymysql.connections.Connection(user=config['client']['user'], password=config['client']['password'], database=wpDatabase, host='enwiki.analytics.db.svc.wikimedia.cloud')
    wp_sql = ('SELECT p.page_title FROM page p inner join logging_logindex l on p.page_id = l.log_page '
              'where log_type = "move" and page_namespace=1 AND page_title like "%/GA_" '
              'and l.log_timestamp > "' + max_review_ts_str + '" '
              'union '
              'SELECT page_title FROM page p inner join revision r on p.page_id = r.rev_page '
              'WHERE page_namespace=1 and r.rev_parent_id = 0 AND page_title LIKE "%/GA_" and rev_timestamp >= "' + max_review_ts_str + '"')
    if override_sql:
        wp_sql = sql
    wp_cursor = wp_conn.cursor(pymysql.cursors.DictCursor)
    wp_count = wp_cursor.execute(wp_sql)
    print("Found " + str(wp_count) + " to check")
    wp_rows = wp_cursor.fetchall()  # BUG FIX: fetchall was referenced, not called
    return wp_rows

@classmethod
def scan_for_new_pages(cls, conn, config, rows):
    """Insert a historical-review row for each GA subpage in rows not already recorded.

    rows is a list of GA subpages. Each is checked to see if it's already in
    the historical database and inserted if it isn't.
    """
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    ctr = 0
    for wp_row in rows:
        ctr += 1
        if ctr % 10 == 0:
            GAN.log(conn, "scan_for_new_pages", None, "Checking " + str(ctr) + " new GA subpages")
        name_space_title = wp_row['page_title'].decode("UTF8").replace("_", " ")  # E.g. "Tree/GA1"
        GAN.log(conn, "scan_for_new_pages", name_space_title, "Checking " + name_space_title)
        sub_page_title = "Talk:" + name_space_title   # E.g. "Talk:Tree/GA1"
        talk_page_title = sub_page_title[:-4]         # E.g. "Talk:Tree"
        article_title = talk_page_title[5:]           # E.g. "Tree"
        page_num = name_space_title[-1:]
        if page_num not in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']:
            continue  # This skips subpages named e.g. "../GAR"
        site = pywikibot.Site('en', 'wikipedia')
        sub_page = pywikibot.Page(site, "Talk:" + article_title + "/GA" + str(page_num))
        if not sub_page.exists():  # BUG FIX: exists was referenced, not called
            continue  # Must have been deleted since the run started
        reviewer = sub_page.oldest_revision['user']
        rp_ts = sub_page.oldest_revision['timestamp']
        review_ts = datetime.datetime(year=rp_ts.year, month=rp_ts.month, day=rp_ts.day, hour=rp_ts.hour, minute=rp_ts.minute, second=rp_ts.second)
        if GAH.sub_page_missing(conn, article_title, page_num):
            GAN.log(conn, "scan_for_new_pages", article_title, "inserting into historical reviews table")
            sql = "insert into " + GA_config.strings['historical GA reviews table name'] + " (reviewer, article_title, page, review_ts, needs_analysis) values ('" + reviewer.replace("'", "") + "','" + article_title.replace("'", "") + "'," + page_num + ",'" + str(review_ts) + "',null)"
            print(sql)
            n = cursor.execute(sql)
            conn.commit()  # BUG FIX: commit was referenced, not called

@classmethod
def check_reviewer_data(cls, conn, config, rows):
    """Reconcile reviewer and review_ts in the historical table against the wiki.

    rows contains a list of page titles of GA subpages; for each, the oldest
    revision's author/timestamp is compared with the stored values and the
    stored row is updated when they differ.
    """
    ctr = 0
    for wp_row in rows:
        ctr += 1
        if ctr % 100 == 0:
            print("Checking " + str(ctr) + " GA subpages")
        name_space_title = wp_row['page_title'].decode("UTF8").replace("_", " ")  # E.g. "Tree/GA1"
        sub_page_title = "Talk:" + name_space_title   # E.g. "Talk:Tree/GA1"
        talk_page_title = sub_page_title[:-4]         # E.g. "Talk:Tree"
        article_title = talk_page_title[5:]           # E.g. "Tree"
        page_num = name_space_title[-1:]
        if page_num not in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']:
            continue  # This skips subpages named e.g. "../GAR"
        site = pywikibot.Site('en', 'wikipedia')
        sub_page = pywikibot.Page(site, sub_page_title)
        reviewer = sub_page.oldest_revision['user']
        rp_ts = sub_page.oldest_revision['timestamp']
        review_ts = datetime.datetime(year=rp_ts.year, month=rp_ts.month, day=rp_ts.day, hour=rp_ts.hour, minute=rp_ts.minute, second=rp_ts.second)
        # Get the information in the historical_GA_reviews table for the reviewer and review_ts
        db_review_info = GAH.get_review_info(conn, article_title, page_num)
        if db_review_info is None:
            GAN.log(conn, "check_reviewer_data", article_title, "No db reviewer found for " + article_title + " / " + str(page_num))
        else:
            GAN.log(conn, "check_reviewer_data", article_title, "db reviewer is " + db_review_info[0] + " for " + article_title + " / " + str(page_num))
            GAN.log(conn, "check_reviewer_data", article_title, "reviewer is " + reviewer + " for " + article_title + " / " + str(page_num))
        # If the historical table doesn't have the right information, update it
        if db_review_info is not None:
            if reviewer != db_review_info[0]:
                GAN.log(conn, "check_reviewer_data", article_title, "Looked up reviewer " + reviewer + " does not match db reviewer " + db_review_info[0] + " for " + article_title + " / " + str(page_num))
                sql_u = "update " + GA_config.strings['historical GA reviews table name'] + " set reviewer = '" + reviewer.replace("'", "") + "' where article_title collate utf8mb4_bin = '" + article_title.replace("'", "") + "' and page = " + str(page_num)
                GAN.log(conn, "check_reviewer_data", article_title, "sql u is " + sql_u)
                cursor = conn.cursor(pymysql.cursors.DictCursor)
                u = cursor.execute(sql_u)
                GAN.log(conn, "check_reviewer_data", article_title, "u is " + str(u))
                conn.commit()  # BUG FIX: commit was referenced, not called
            if review_ts != db_review_info[1]:
                sql_u = "update " + GA_config.strings['historical GA reviews table name'] + " set review_ts = '" + str(review_ts) + "' where article_title collate utf8mb4_bin = '" + article_title.replace("'", "''") + "' and page = " + str(page_num)
                print(sql_u)
                cursor = conn.cursor(pymysql.cursors.DictCursor)
                u = cursor.execute(sql_u)
                conn.commit()  # BUG FIX: commit was referenced, not called

@classmethod
def get_review_info(cls, conn, article_title, page_num):
    """Return (reviewer, review_ts) for article_title/page_num, or None.

    None is returned both when no row exists and when more than one row
    matches (each case is logged).
    """
    # In use 2023-02-25
    sql = "select reviewer, review_ts from " + GA_config.strings['historical GA reviews table name'] + " where article_title collate utf8mb4_bin = '" + article_title.replace("'", "''") + "' and page = " + str(page_num)
    GAN.log(conn, "get_review_info", article_title, 'Called')
    cursor = conn.cursor(pymysql.cursors.DictCursor)
    count = cursor.execute(sql)
    result = None
    if count == 0:
        GAN.log(conn, "get_review_info", article_title, "Can't find reviewer info record for " + article_title + " / " + str(page_num))
    elif count > 1:
        GAN.log(conn, "get_review_info", article_title, "Found more than one record for " + article_title + " / " + str(page_num))
    else:
        row = cursor.fetchone()  # BUG FIX: fetchone was referenced, not called
        GAN.log(conn, "get_review_info", article_title, "Found one record for " + article_title + " / " + str(page_num))
        result = (row['reviewer'], row['review_ts'])
    cursor.close()  # BUG FIX: was unreachable (after the return) and not called
    return result

@classmethod def get_latest_reviewer_revision_timestamp(cls, page, reviewer): # In use 2023-02-25 # TODO change to a method of GA_sub_page latest_revision_timestamp = page.latest_revision.timestamp for sub_rev in page.revisions: if sub_rev['user'] == reviewer: latest_revision_timestamp = sub_rev['timestamp'] break return(latest_revision_timestamp)

@classmethod def strip_comments(cls, text): # In use 2023-02-25 new_text = text com_re = re.search(r"\<\!\-\-[^\-]*\-\-\>",new_text) while com_re is not None: #print("******Before stripping") #print(new_text) new_text = new_text[:com_re.span(0)[0]] + new_text[com_re.span(0)[1]:] #print("******After stripping") #print(new_text) com_re = re.search(r"\<\!\-\-[^\-]*\-\-\>",new_text) #print("re:") #print(com_re) return(new_text)

@classmethod
def get_link_redirect_target(cls, link):
    """Return the redirect target Page for link, or None when the page is
    missing, has an invalid/unsupported title, or is not a redirect.
    """
    # In use 2023-02-25
    site = pywikibot.Site('en', 'wikipedia')
    page = pywikibot.Page(site, GAH.strip_comments(link))
    # BUG FIX: exists/get/getRedirectTarget are methods and were not called,
    # so the try/except branches below could never fire.
    try:
        page.exists()
    except pywikibot.exceptions.UnsupportedPageError:
        return None
    if page.exists():
        try:
            page_text = page.get()
            return None  # if not an error, then this is not a redirect
        except pywikibot.exceptions.IsRedirectPageError:
            return page.getRedirectTarget()
        except pywikibot.exceptions.InvalidTitleError:
            return None
    else:
        return None

@classmethod def get_param_value_pair(cls, param_text): equals = param_text.find("=") if equals is not None: pair = [param_text[:equals].strip, param_text[equals+1:].strip] return(pair) else: return(None)

@classmethod def get_piped_elements(cls,template_text): # In use 2023-02-25 text = template_text element_list = [] # Strip the braces if they're there if text[:2] == "": text = text[:-2] element = "" template_depth = 0 left_brace_found = False right_brace_found = False left_bracket_found = False right_bracket_found = False for letter in text: if template_depth == 0 and letter == "|": element_list.append(element) element = "" else: element += letter if letter == "{": if left_brace_found is True: template_depth += 1 left_brace_found = False else: left_brace_found = True elif letter == "}": if right_brace_found is True: template_depth -= 1 right_brace_found = False else: right_brace_found = True elif letter == "[": if left_bracket_found is True: template_depth += 1 left_bracket_found = False else: left_bracket_found = True elif letter == "]": if right_bracket_found is True: if template_depth > 0: template_depth -= 1 right_bracket_found = False else: right_bracket_found = True element_list.append(element) return(element_list)

@classmethod def find_enclosed_string(cls, string): # Used to search for matching right brace pairs when getting params. # In use 2023-02-25 #print("String passed to find_enclosed_string is <" + string + ">") left_brace_pair_cnt = 0 enclosed_str_range = [0, 0] for i, s in enumerate(string): s2 = (string[i:i+2]) if s2 == "": left_brace_pair_cnt -= 1 if left_brace_pair_cnt == 0: enclosed_str_range[1] = i           if enclosed_str_range[1] > enclosed_str_range[0]: return(string[enclosed_str_range[0]:enclosed_str_range[1]+2]) @classmethod def get_template_list_from_text(cls, page_text, search_pattern): # In use 2023-02-25 text = GAH.strip_comments(page_text) #templates_re = re.search("",text,re.IGNORECASE) templates_re = re.search(search_pattern,text,re.IGNORECASE) template_list = [] while templates_re is not None: templates_span = templates_re.span(0) text = text[templates_span[0]:] template_text = GAH.find_enclosed_string(text) template_list.append(template_text) if text is None or template_text is None: break text = text[len(template_text):] templates_re = re.search(search_pattern,text,re.IGNORECASE) return template_list

@classmethod
def evaluate_page_classes(cls, articlep, talkp, subp, conn, row, has_been_set):
    """Examine the article/talk page classes for one historical row and record
    any inconsistency (redirects, dabs, missing pages) on that row.

    Returns (continue_flag, searchp): continue_flag is True when processing of
    this row should stop; searchp is the talk page to use for further
    searching (it may be re-pointed at a redirect target).
    """
    # In use 2023-02-25
    # TODO Could collapse a lot of this by removing the type=null and moving
    # all the updates to the end, just setting comments in each
    continue_flag = False
    searchp = talkp
    site = pywikibot.Site("en", "wikipedia")
    print("Called evaluate_page_classes with " + articlep.page_class + "; " + talkp.page_class)
    if articlep.page_class == "Normal":
        if talkp.page_class == "Normal":
            pass  # No problems
        elif talkp.page_class == "Redirect":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Talk page is a redirect but the article page is not'}, has_been_set)
        elif talkp.page_class == "Disambiguation":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Talk page is a disambiguation page but the article page is not'}, has_been_set)
        elif talkp.page_class == "No page":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Talk page does not exist'}, has_been_set)
    elif articlep.page_class == "Disambiguation":
        record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Article page is a disambiguation page'}, has_been_set)
    elif articlep.page_class == "Redirect":
        if talkp.page_class == "Normal":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Article page is a redirect but the talk page is not'}, has_been_set)
        elif talkp.page_class == "Redirect":
            # BUG FIX throughout: getRedirectTarget/title/exists are methods
            # and were not called.
            article_redirects_to = articlep.page.getRedirectTarget().title()
            hash_offset = article_redirects_to.find("#")
            if hash_offset > 0:
                article_redirects_to = article_redirects_to[:hash_offset]
            talk_page_redirects_to = talkp.page.getRedirectTarget().title()
            hash_offset = talk_page_redirects_to.find("#")
            if hash_offset > 0:
                talk_page_redirects_to = talk_page_redirects_to[:hash_offset]
            if "Talk:" + article_redirects_to == talk_page_redirects_to:
                record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Article and talk pages are redirects; GA page should be moved'}, has_been_set)
                # Write an audit trail message giving the details.
                audit_msg = "Suggest move of GA subpage: |" + subp.page.title() + "||[[" + talkp.page.getRedirectTarget().title() + "/GA" + str(row['page'])
                GAN.log(conn, "GAH.evaluate_page_classes", row['article_title'], audit_msg)
            else:
                record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Article and talk pages are redirects but to different pages'}, has_been_set)
                audit_msg = "Conflicting redirect targets: |" + subp.page.title() + "|" + talkp.page.getRedirectTarget().title() + "/GA" + str(row['page']) + "|" + "Talk:" + articlep.page.getRedirectTarget().title()
                GAN.log(conn, "Find nominators from article revisions", row['article_title'], audit_msg)
        elif talkp.page_class == "Disambiguation":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Talk page is a disambiguation page but the article page is a redirect'}, has_been_set)
        elif talkp.page_class == "No page":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Article is a redirect and talk page does not exist; GA page should probably be moved'}, has_been_set)
            search_page_title = "Talk:" + articlep.page.getRedirectTarget().title()
            search_page = pywikibot.Page(site, search_page_title)
            # BUG FIX: this branch tested searchp (the original talk page)
            # instead of the freshly built search_page.
            if not search_page.exists():  # TODO do we have to continue? Might be ok to let it run
                continue_flag = True
            else:
                searchp = GA_talk_page(search_page)
    elif articlep.page_class == "No page":
        if talkp.page_class == "Normal":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Article page does not exist but talk page exists'}, has_been_set)
        elif talkp.page_class == "Redirect":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Article does not exist and talk page is a redirect; GA page should probably be moved'}, has_been_set)
            search_page_title = talkp.page.getRedirectTarget().title()
            search_page = pywikibot.Page(site, search_page_title)
            if not search_page.exists():  # TODO do we have to continue? Might be ok to let it run
                continue_flag = True
            else:
                searchp = GA_talk_page(search_page)
        elif talkp.page_class == "Disambiguation":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Talk page is a disambiguation page and the article does not exist'}, has_been_set)
        elif talkp.page_class == "No page":
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'Neither the article page nor the talk page exists'}, has_been_set)
    return (continue_flag, searchp)

@classmethod def get_GAN_entry_span(cls, GAN_page_text, title): # In use 2023-02-25 escaped_title = re.escape(title) # The raw strings are to ensure that the title we've found (a) is the entire article name (otherwise we match XXabcYY when searching for abc) and (b) has a user name on the line entry_re = re.search(r"[=\|]\s*" + escaped_title + r"\s*[\|}]" + r".*\[\]*\|", GAN_page_text)       #entry_re = re.search(escaped_title + r".*\[\[[uU]ser[^\|]*\|", GAN_page_text)        if entry_re is None:            entry_re = re.search(r"[=\|]\s*" + escaped_title + r"\s*[\|}]" + r".*\[\[[uU]ser[^\:]*:[^\*\]", GAN_page_text)            #entry_re = re.search(escaped_title + r".*\[\uU]ser[^\:]*:[^\*\]", GAN_page_text)        if entry_re is not None:            return(entry_re.span(0))        else:            return(None)

@classmethod def get_fields_from_GAN_entry(cls, conn, GAN_text, entry_span): GAN.log(conn, "get_fields_from_GAN_entry", None, "GAN_text is" + GAN_text) entry_text = GAN_text[entry_span[0]:entry_span[1]] GAN.log(conn, "get_fields_from_GAN_entry", None, "entry_text is" + entry_text) fields = {'type': 'GAN'} user_re = re.search(r"[Uu]ser[^:]*:[^|]*\|", entry_text) if user_re == None: user_re = re.search(r"[Uu]ser[^:]*:[^]]*\]", entry_text) if user_re is not None: user_span = user_re.span(0) user_text = entry_text[user_span[0]:user_span[1]] GAN.log(conn, "get_fields_from_GAN_entry", None, "user_text is" + user_text) colon_pos = user_text.find(":") user_name = user_text[colon_pos+1:-1] GAN.log(conn, "get_fields_from_GAN_entry", None, "user_name is <" + user_name + ">") fields['nominator'] = user_name.strip ts_re = re.search(r"\d\d:\d\d, \d+ [a-zA-z]* \d\d\d\d", entry_text) if ts_re is not None: ts_span = ts_re.span(0) nomination_ts_text = entry_text[ts_span[0]:ts_span[1]] nomination_ts = parse(nomination_ts_text) fields['nomination_ts'] = nomination_ts header_list = re.findall("==[^=]+==", GAN_text[:entry_span[0]]) if len(header_list) > 0: header = header_list[-1:][0] header = header.replace("=","").strip fields['subtopic'] = header return(fields) @classmethod def get_nominee_list_from_revision(cls, rev, talkp): # In use 2023-02-25 nominee_templates = GAH.get_template_list_from_text(rev['slots']['main']['*'], r"") nominees = [] for nominee_template in nominee_templates: try: # Don't bother to analyze badly formed templates; just skip them nominee = GAnominee(nominee_template, rev.revid, talkp) except GA_history_Exception as e:               print("Badly formed nominee template found for " + talkp.page.title + " in rev " + str(rev.revid)) continue nominees.append(nominee) return(nominees)

class GARlink:

    def __init__(self, template_text, revid, talkp):  # In use 2023-02-25
        """An instance of a GAR/link template in a talk page revision.

        template_text -- the raw template text
        revid -- revision id of the talk-page revision the template came from
        talkp -- the GA_talk_page wrapper for the talk page
        Raises GA_history_Exception when template_text is not a well-formed
        GAR/link template.
        """
        self.date = None
        self.page_num = None
        self.GAR_page_num = None
        self.status = None
        self.pagename = None
        self.shortdesc = None

        self.talkp = talkp
        self.talkp_revid = revid
        self.text = template_text

        param_dict = {}
        if not isinstance(template_text, str):
            raise GA_history_Exception
        if len(template_text) < 1:
            raise GA_history_Exception
        param_list = GAH.get_piped_elements(template_text)
        if re.match("gar/link", param_list[0], re.IGNORECASE) is None:
            # The first argument isn't one of the possible names for a GAR/link template
            raise GA_history_Exception
        elif len(param_list) < 2:
            # We require at least one of the arguments
            raise GA_history_Exception
        else:
            if param_list[1].find("=") == -1:
                # If there's no equal sign in the first argument, it should be a timestamp
                try:
                    # .strip() was missing its parens in the original (bug)
                    self.date = parse(param_list[1].replace("(UTC)", "").strip())
                except ValueError:
                    pass
            else:
                pv = GAH.get_param_value_pair(param_list[1])
                param_dict[pv[0]] = pv[1]
            for element in param_list[2:]:
                # get the remaining arguments, if any, into the dictionary
                pv = GAH.get_param_value_pair(element)
                param_dict[pv[0]] = pv[1]
            for k in param_dict:  # was param_dict.keys (method object, not iterable) -- bug fix
                if k == "page":
                    self.page_num = param_dict[k]
                elif k == "GARpage":
                    self.GAR_page_num = param_dict[k]
                elif k == "status":
                    self.status = param_dict[k]
                elif k == "pagename":
                    self.pagename = param_dict[k]
                elif k == "shortdesc":
                    self.shortdesc = param_dict[k]

class GAnominee:

    def __init__(self, template_text, revid, talkp):  # In use 2023-02-25
        """An instance of a GAnominee template in a talk page revision.

        template_text -- the raw template text
        revid -- revision id of the talk-page revision the template came from
        talkp -- the GA_talk_page wrapper for the talk page
        Raises GA_history_Exception when template_text is not a well-formed
        GAnominee template.
        """
        self.date = None
        self.nominator_string = None
        self.nominator = None
        self.page_num = None
        self.status = None
        self.subtopic = None
        self.note = None
        self.shortdesc = None
        self.time = None
        self.article_revid = None

        self.talkp = talkp
        self.talkp_revid = revid
        self.text = template_text

        param_dict = {}
        if not isinstance(template_text, str):
            raise GA_history_Exception
        if len(template_text) < 1:
            raise GA_history_Exception
        param_list = GAH.get_piped_elements(template_text)
        if re.match(r"ga\s?nominee", param_list[0], re.IGNORECASE) is None:
            # The first argument isn't one of the possible names for a GAnominee template
            raise GA_history_Exception
        elif len(param_list) < 2:
            # We require at least one of the arguments
            raise GA_history_Exception
        else:
            if param_list[1].find("=") == -1:
                # If there's no equal sign in the first argument, it should be a timestamp
                try:
                    # .strip() was missing its parens in the original (bug)
                    self.date = parse(param_list[1].replace("(UTC)", "").strip())
                except ValueError:
                    pass
            else:
                pv = GAH.get_param_value_pair(param_list[1])
                param_dict[pv[0]] = pv[1]
            for element in param_list[2:]:
                # get the remaining arguments, if any, into the dictionary
                pv = GAH.get_param_value_pair(element)
                param_dict[pv[0]] = pv[1]
            for k in param_dict:  # was param_dict.keys (method object) -- bug fix
                if k == "nominator":
                    self.nominator_string = param_dict[k]
                    # Extract the bare user name from a [[User:...]] or
                    # [[User talk:...]] wikilink.
                    user_re = re.search(r"\[\[User( talk:|:)[^\]]+\]\]", self.nominator_string, re.IGNORECASE)
                    if user_re is not None:
                        user_string = self.nominator_string[user_re.span(0)[0]:user_re.span(0)[1]]
                        pipe_pos = user_string.find("|")
                        if pipe_pos >= 0:
                            user_string = user_string[:pipe_pos]
                            self.nominator = user_string[user_string.find(":") + 1:]
                        else:
                            self.nominator = user_string[user_string.find(":") + 1:-2]
                elif k == "page":
                    self.page_num = param_dict[k]
                elif k == "status":
                    self.status = param_dict[k]
                elif k == "subtopic":
                    self.subtopic = param_dict[k]
                elif k == "note":
                    self.note = param_dict[k]
                elif k == "shortdesc":
                    self.shortdesc = param_dict[k]
                elif k == "time":
                    self.time = param_dict[k]

class GAO:

    def __init__(self, template_text, revid, talkp):  # In use 2023-02-25
        """An instance of a GA template in a talk page revision.

        Raises GA_history_Exception when template_text is not a well-formed
        GA template.
        """
        self.date = None
        self.article_revid = None
        self.page_num = None
        self.topic = None
        self.subtopic = None
        self.small = None

        self.talkp = talkp
        self.talkp_revid = revid
        self.text = template_text

        param_dict = {}
        if not isinstance(template_text, str):
            raise GA_history_Exception
        if len(template_text) < 1:
            raise GA_history_Exception
        param_list = GAH.get_piped_elements(template_text)
        if re.match(r"ga", param_list[0], re.IGNORECASE) is None:
            # The first argument isn't one of the possible names for a GA template
            raise GA_history_Exception
        elif len(param_list) < 2:
            # We require at least one of the arguments
            raise GA_history_Exception
        else:
            for element in param_list[1:]:
                if element.find("=") == -1:
                    # If there's no equal sign in the argument, it should be a timestamp
                    try:
                        # .strip() was missing its parens in the original (bug)
                        self.date = parse(element.replace("(UTC)", "").strip())
                    except ValueError:
                        pass
                else:
                    pv = GAH.get_param_value_pair(element)
                    param_dict[pv[0]] = pv[1]
            for k in param_dict:  # was param_dict.keys (method object) -- bug fix
                if k == "date":
                    if self.date is None:
                        try:
                            self.date = parse(param_dict[k].replace("(UTC)", "").strip())
                        except ValueError:
                            pass
                elif k == "oldid":
                    self.article_revid = param_dict[k]
                elif k == "page":
                    self.page_num = param_dict[k]
                elif k == "topic":
                    self.topic = param_dict[k]
                elif k == "subtopic":
                    self.subtopic = param_dict[k]
                elif k == "small":
                    self.small = param_dict[k]
        # Redundant with the assignments above, but preserved from the original.
        if revid is not None:
            self.talkp_revid = revid
        if talkp is not None:
            self.talkp = talkp

    def get_history_fields(self):  # In use 2023-02-25
        """Return the historical-table fields implied by this GA template: a passed GAN.

        subtopic wins over topic when both are present; outcome_ts comes from
        the template date when known.
        """
        fields = {'type': 'GAN', 'outcome': 'Passed'}
        if self.subtopic is not None:
            fields['subtopic'] = self.subtopic
        if self.topic is not None and self.subtopic is None:
            fields['subtopic'] = self.topic
        if self.date is not None:
            fields['outcome_ts'] = self.date
        return fields

class FailedGA:

    def __init__(self, template_text, revid, talkp):  # In use 2023-02-25
        """An instance of a FailedGA template in a talk page revision.

        Raises GA_history_Exception when template_text is not a well-formed
        FailedGA template.
        """
        self.date = None
        self.article_revid = None
        self.page_num = None
        self.topic = None
        self.subtopic = None
        self.small = None

        self.talkp = talkp
        self.talkp_revid = revid
        self.text = template_text

        param_dict = {}
        if not isinstance(template_text, str):
            raise GA_history_Exception
        if len(template_text) < 1:
            raise GA_history_Exception
        param_list = GAH.get_piped_elements(template_text)
        if re.match(r"failed\s?ga", param_list[0], re.IGNORECASE) is None:
            # The first argument isn't one of the possible names for a FailedGA template
            raise GA_history_Exception
        elif len(param_list) < 2:
            # We require at least one of the arguments
            raise GA_history_Exception
        else:
            if param_list[1].find("=") == -1:
                # If there's no equal sign in the first argument, it should be a timestamp
                try:
                    # .strip() was missing its parens in the original (bug)
                    self.date = parse(param_list[1].replace("(UTC)", "").strip())
                except ValueError:
                    pass
            else:
                pv = GAH.get_param_value_pair(param_list[1])
                param_dict[pv[0]] = pv[1]
            for element in param_list[2:]:
                # get the remaining arguments, if any, into the dictionary
                pv = GAH.get_param_value_pair(element)
                param_dict[pv[0]] = pv[1]
            for k in param_dict:  # was param_dict.keys (method object) -- bug fix
                if k == "date":
                    if self.date is None:
                        try:
                            self.date = parse(param_dict[k].replace("(UTC)", "").strip())
                        except ValueError:
                            pass
                elif k == "oldid":
                    self.article_revid = param_dict[k]
                elif k == "page":
                    self.page_num = param_dict[k]
                elif k == "topic":
                    self.topic = param_dict[k]
                elif k == "subtopic":
                    self.subtopic = param_dict[k]
                elif k == "small":
                    self.small = param_dict[k]
        # Redundant with the assignments above, but preserved from the original.
        if revid is not None:
            self.talkp_revid = revid
        if talkp is not None:
            self.talkp = talkp

    def get_history_fields(self):  # In use 2023-02-25
        """Return the historical-table fields implied by this FailedGA template: a failed GAN."""
        fields = {'type': 'GAN', 'outcome': 'Failed'}
        if self.subtopic is not None:
            fields['subtopic'] = self.subtopic
        if self.topic is not None and self.subtopic is None:
            fields['subtopic'] = self.topic
        if self.date is not None:
            fields['outcome_ts'] = self.date
        return fields

class DelistedGA:

    def __init__(self, template_text, revid, talkp):  # In use 2023-02-25
        """An instance of a DelistedGA template in a talk page revision.

        Raises GA_history_Exception when template_text is not a well-formed
        DelistedGA template.
        """
        self.date = None
        self.article_revid = None
        self.page_num = None
        self.topic = None
        self.subtopic = None
        self.small = None

        self.talkp = talkp
        self.talkp_revid = revid
        self.text = template_text

        param_dict = {}
        if not isinstance(template_text, str):
            raise GA_history_Exception
        if len(template_text) < 1:
            raise GA_history_Exception
        param_list = GAH.get_piped_elements(template_text)
        if re.match(r"delisted\s?ga", param_list[0], re.IGNORECASE) is None:
            # The first argument isn't one of the possible names for a DelistedGA template
            raise GA_history_Exception
        elif len(param_list) < 2:
            # We require at least one of the arguments
            raise GA_history_Exception
        else:
            if param_list[1].find("=") == -1:
                # If there's no equal sign in the first argument, it should be a timestamp
                try:
                    # .strip() was missing its parens in the original (bug)
                    self.date = parse(param_list[1].replace("(UTC)", "").strip())
                except ValueError:
                    pass
            else:
                pv = GAH.get_param_value_pair(param_list[1])
                param_dict[pv[0]] = pv[1]
            for element in param_list[2:]:
                # get the remaining arguments, if any, into the dictionary
                pv = GAH.get_param_value_pair(element)
                param_dict[pv[0]] = pv[1]
            for k in param_dict:  # was param_dict.keys (method object) -- bug fix
                if k == "date":
                    # Original tested self.timestamp, an attribute that is never
                    # defined (AttributeError); the sibling FailedGA/GAO classes
                    # test self.date here -- fixed to match.
                    if self.date is None:
                        try:
                            self.date = parse(param_dict[k].replace("(UTC)", "").strip())
                        except ValueError:
                            pass
                elif k == "oldid":
                    self.article_revid = param_dict[k]
                elif k == "page":
                    self.page_num = param_dict[k]
                elif k == "topic":
                    self.topic = param_dict[k]
                elif k == "subtopic":
                    self.subtopic = param_dict[k]
                elif k == "small":
                    self.small = param_dict[k]
        # Redundant with the assignments above, but preserved from the original.
        if revid is not None:
            self.talkp_revid = revid
        if talkp is not None:
            self.talkp = talkp

    def get_history_fields(self):  # In use 2023-02-25
        """Return the historical-table fields implied by this DelistedGA template: a delisting GAR."""
        fields = {'type': 'GAR', 'outcome': 'Delisted'}
        if self.subtopic is not None:
            fields['subtopic'] = self.subtopic
        if self.topic is not None and self.subtopic is None:
            fields['subtopic'] = self.topic
        if self.date is not None:
            fields['outcome_ts'] = self.date
        return fields

class GA_article_page:

    def __init__(self, article_page):  # In use 2023-02-25
        """Classify an article page.

        article_page -- a pywikibot Page object of the article page.
        page_class becomes one of "No page", "Redirect", "Disambiguation", "Normal".
        """
        self.page = article_page
        self.page_class = None
        # pywikibot's exists() / isDisambig() are methods; the original was
        # missing the call parens, so the bound method was always truthy and
        # the classification was wrong (bug fix).
        if not self.page.exists():
            self.page_class = "No page"
        else:
            if GAN.is_redirect(self.page):
                self.page_class = "Redirect"
            elif self.page.isDisambig():
                self.page_class = "Disambiguation"
            else:
                self.page_class = "Normal"

class GA_talk_page:

    def __init__(self, talk_page):  # In use 2023-02-25
        """Classify a talk page.

        talk_page -- a pywikibot Page object of the talk page.
        page_class becomes one of "No page", "Redirect", "Disambiguation", "Normal".
        """
        self.page = talk_page
        self.page_class = None
        # pywikibot's exists() / isDisambig() are methods; the original was
        # missing the call parens, so the bound method was always truthy and
        # the classification was wrong (bug fix).
        if not self.page.exists():
            self.page_class = "No page"
        else:
            if GAN.is_redirect(self.page):
                self.page_class = "Redirect"
            elif self.page.isDisambig():
                self.page_class = "Disambiguation"
            else:
                self.page_class = "Normal"

class GA_sub_page:

    def __init__(self, sub_page):  # In use 2023-02-25
        """Wrap a pywikibot Page object for a GA review subpage ("Talk:X/GAn").

        article_page_title is the subpage title with the leading "Talk:" and
        trailing "/GAn" stripped; page_num can be int or str in the source but
        is stored as str.
        """
        self.article_page_title = sub_page._link._text[5:-4]
        self.page_num = str(sub_page._link._text[-1:])
        self.page = sub_page
        self.reviewer = None
        self.review_ts = None
        self.latest_revision_ts = None
        self.last_non_bot_user = None
        self.type = None
        self.outcome = None
        self.outcome_ts = None
        self.nominator = None
        self.nomination_ts = None
        self.subtopic = None
        self.last_edited_by_reviewer_ts = None
        self.last_non_bot_user_ts = None
        self.comments = None
        self.review_last_edited_ts = None
        self.review_last_edited_by_reviewer_ts = None  # (original initialized this twice)
        # pywikibot exists()/revisions()/lastNonBotUser() are methods; call
        # parens were missing in the original (bug fix).
        if self.page.exists():
            self.reviewer = self.page.oldest_revision['user']
            rp_ts = self.page.oldest_revision['timestamp']
            self.review_ts = GAH.convert_timestamp_to_datetime(rp_ts)
            self.latest_revision_ts = self.page.latest_revision.timestamp
            self.last_non_bot_user = self.page.lastNonBotUser()
            for rev in self.page.revisions():
                if rev['user'] == self.last_non_bot_user:
                    self.last_non_bot_user_ts = rev['timestamp']
                    break
            for rev in self.page.revisions():
                if rev['user'] == self.reviewer:
                    self.last_edited_by_reviewer_ts = rev['timestamp']
                    break
            self.review_last_edited_ts = GAH.get_latest_non_bot_revision_timestamp(self.page)
            self.review_last_edited_by_reviewer_ts = GAH.get_latest_reviewer_revision_timestamp(self.page, self.reviewer)
            self.review_last_edited_by_reviewer_ts = GAH.convert_timestamp_to_datetime(self.review_last_edited_by_reviewer_ts)

    def unset_needs_analysis_flag(self, conn):
        """Clear the needs_analysis flag for this review's rows in the historical table."""
        cursor = conn.cursor(pymysql.cursors.DictCursor)
        sql = ("update " + GA_config.strings['historical GA reviews table name']
               + " set needs_analysis = 'N' where article_title = '"
               + self.article_page_title.replace("'", "''")
               + "' and page = " + str(self.page_num))
        unset_count = cursor.execute(sql)
        conn.commit()  # commit() call parens were missing (bug fix)

    def _matches_review_window(self, page_num, date):
        """Shared matcher for the talk-page GA templates.

        A template refers to this review if its page number matches, or its
        date is within 8 days of the review start or the reviewer's last edit,
        or falls between those two timestamps.
        """
        if page_num is not None and page_num == str(self.page_num):
            return True
        if date is not None:
            if abs((self.review_ts - date).days) <= 8:
                return True
            if abs((self.last_edited_by_reviewer_ts - date).days) <= 8:
                return True
            if self.review_ts < date and self.last_edited_by_reviewer_ts > date:
                return True
        return False

    def match_FailedGA(self, failedGA):  # In use 2023-02-25
        """True when the FailedGA template refers to this review subpage."""
        return self._matches_review_window(failedGA.page_num, failedGA.date)

    def match_GAO(self, a_ga):  # In use 2023-02-25
        """True when the GA template refers to this review subpage."""
        return self._matches_review_window(a_ga.page_num, a_ga.date)

    def match_DelistedGA(self, delistedGA):
        """True when the DelistedGA template refers to this review subpage."""
        return self._matches_review_window(delistedGA.page_num, delistedGA.date)

    def match_GARlink(self, garlink):
        """True when the GAR/link template refers to this review subpage."""
        return self._matches_review_window(garlink.page_num, garlink.date)

    def update_from_GAnominee(self, GAnominee):  # In use 2023-02-25
        """Fill in type/nomination_ts/nominator/subtopic from a GAnominee template.

        Returns a dict of only the fields newly set, suitable for
        update_historical_GA_reviews.
        """
        fields = {}
        if self.type is None:
            self.type = 'GAN'
            fields['type'] = 'GAN'
        if GAnominee.date is not None and self.nomination_ts is None:
            self.nomination_ts = GAnominee.date
            fields['nomination_ts'] = GAnominee.date
        if GAnominee.nominator is not None and self.nominator is None:
            self.nominator = GAnominee.nominator
            fields['nominator'] = GAnominee.nominator
        if GAnominee.subtopic is not None and self.subtopic is None:
            self.subtopic = GAnominee.subtopic
            fields['subtopic'] = GAnominee.subtopic
        return fields

    def update_from_GARlink(self, GARlink):  # In use 2023-02-25
        """Mark this review as a GAR if not already typed; returns the fields newly set."""
        fields = {}
        if self.type is None:
            self.type = 'GAR'
            fields['type'] = 'GAR'
        return fields

    def update_from_Article_History(self, article_history):  # In use 2023-02-25
        """Match this review subpage against an action in an Article history template.

        Returns a dict of subtopic/type/outcome_ts/outcome fields from the
        matched action, or None when no action matches.
        """
        if article_history is None:
            return None
        fields = {}
        if self.subtopic is None and article_history.topic is not None:
            self.subtopic = article_history.topic
            fields['subtopic'] = article_history.topic
        typed_actions = [x for x in article_history.actions if 'type' in x]
        GA_actions = [x for x in typed_actions if x['type'].lower() in ['gan', 'gar']]
        matched_action = None
        if len(GA_actions) == 1:
            matched_action = GA_actions[0]
        else:
            for action in GA_actions:
                # The action matches the subpage if the link matches, or the date
                # is within 8 days of the range of reviewer dates on the review
                if 'link' in action:
                    # Possibilities are:
                    #   a. It matches identically
                    #   b. It just gives the trailing subpage string, e.g. "/GA1"
                    #   c. It uses a magic word, which we substitute and test
                    #      (NOTE(review): the pattern for case c was lost in the
                    #      source; re.match("", link) matches everything -- TODO
                    #      confirm the intended pattern)
                    #   d. It's a redirect, in which case we check whether the
                    #      redirect is to our subpage
                    link = action['link']
                    hash_pos = link.find("#")  # renamed from "hash" (shadowed builtin)
                    if hash_pos >= 0:
                        link = link[:hash_pos].strip()
                    link = link.replace("_", " ")
                    if link == self.page.title() or link.replace("_", " ") == self.page.title() or link == self.page.title()[-4:]:
                        # Case a and b
                        matched_action = action
                    elif re.match("", link) is not None and link[-4:] == self.page.title()[-4:]:
                        # Case c
                        matched_action = action
                    else:
                        if link is not None and link != '':
                            target_page = None
                            try:
                                target_page = GAH.get_link_redirect_target(link)
                            except pywikibot.exceptions.InvalidTitleError:
                                pass
                            if target_page is not None:
                                if target_page.title() == self.page.title():
                                    # Case d
                                    matched_action = action
                if matched_action is None and 'date' in action:
                    if (action['date'] is not None and self.review_ts is not None
                            and self.review_last_edited_by_reviewer_ts is not None
                            and self.last_non_bot_user_ts is not None
                            and self.review_last_edited_ts is not None):
                        # Check dates next
                        if abs((self.review_ts - action['date']).days) < 7 \
                                or abs((self.review_last_edited_by_reviewer_ts - action['date']).days) < 7 \
                                or abs((self.last_non_bot_user_ts - action['date']).days) < 7 \
                                or abs((self.review_last_edited_ts - action['date']).days) < 7:
                            matched_action = action
        if matched_action is None:
            return None
        if 'type' in matched_action:
            fields['type'] = matched_action['type']
        if 'date' in matched_action:
            fields['outcome_ts'] = matched_action['date']
        if 'result' in matched_action:
            fields['outcome'] = matched_action['result']
        return fields

    def reset_attributes(self, conn):  # In use 2023-02-25
        """Null out all derived columns for this review in the historical table."""
        sql_u = "update " + GA_config.strings['historical GA reviews table name'] + " set type = null, comments = null, outcome = null, outcome_ts = null, nominator = null, nomination_ts = null, subtopic = null"
        sql_u += " where article_title collate utf8mb4_bin = '" + self.article_page_title.replace("'", "''") + "' and page = " + str(self.page_num)
        cursor_u = conn.cursor(pymysql.cursors.DictCursor)
        u = cursor_u.execute(sql_u)
        conn.commit()  # commit() call parens were missing (bug fix)

    def check_for_GAR_headers(self, conn, row, has_been_set):  # In use 2023-02-25
        """Mark the review as a GAR when the subpage has a reassessment/sweeps header."""
        if not has_been_set['type']:
            GAR_re = re.search(r"\s*\=\=*\s*GA [rR]eassessment\s*\=\=*", self.page.text)
            if GAR_re is not None:
                record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'GAR'}, has_been_set)
            else:
                GAR_re = re.search(r"\s*\=\=*\s*GA [sS]weep[s]*\s*\=\=*", self.page.text)
                if GAR_re is not None:
                    record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'GAR'}, has_been_set)

    def check_for_under_review(self, conn, row, has_been_set):  # In use 2023-02-25
        """Mark still-active GAN reviews of this article as 'Under review'."""
        if not has_been_set['outcome']:
            sql = "select h.article_title, h.page, h.review_ts from " + GA_config.strings['historical GA reviews table name'] + " h"
            sql += " inner join active_nominations n on h.article_title = n.title and h.page = n.page"
            sql += " where h.article_title collate utf8mb4_bin = '" + self.article_page_title.replace("'", "''") + "' and h.outcome is null and h.type = 'GAN' and h.review_ts > '2022-07-01'"
            cursor = conn.cursor(pymysql.cursors.DictCursor)
            active = cursor.execute(sql)
            if active > 0:
                # fetchall() call parens were missing (bug fix); inner loop
                # variable renamed so it no longer shadows the row parameter.
                for active_row in cursor.fetchall():
                    record_count = GAH.update_historical_GA_reviews(conn, active_row, {'outcome': 'Under review'}, has_been_set)

    def try_article_history_update(self, conn, searchp, has_been_set, row):  # In use 2023-02-25
        """Try to fill unset fields from the talk page's Article history template."""
        if not has_been_set['type'] or not has_been_set['outcome'] or not has_been_set['outcome_ts'] or not has_been_set['subtopic']:
            article_history_template_text = GAN.get_article_history_template(searchp.page.text)
            if article_history_template_text is not None:
                try:
                    article_history = Article_History(conn, article_history_template_text, searchp.page.latest_revision_id, searchp)
                except GA_history_Exception:
                    return None
                fields = self.update_from_Article_History(article_history)
                record_count = GAH.update_historical_GA_reviews(conn, row, fields, has_been_set)

    def try_GA(self, conn, talkp, has_been_set, row):  # In use 2023-02-25
        """Try to fill unset fields from a GA template on the talk page."""
        if self.type == 'GAR':
            # No point in checking GAN history if we already know this is a GAR
            return None
        if not has_been_set['type'] or not has_been_set['outcome'] or not has_been_set['outcome_ts'] or not has_been_set['subtopic']:
            # NOTE(review): the template regex argument appears to have been
            # lost in the source (empty r"") -- TODO confirm intended pattern.
            GA_templates = GAH.get_template_list_from_text(talkp.page.text, r"")
            GAs = []
            for GA_template in GA_templates:
                a_ga = GAO(GA_template, talkp.page.latest_revision_id, talkp)
                GAs.append(a_ga)
            for a_ga in GAs:
                if self.match_GAO(a_ga):
                    fields = a_ga.get_history_fields()  # call parens were missing (bug fix)
                    record_count = GAH.update_historical_GA_reviews(conn, row, fields, has_been_set)
                    break

    def try_failed_GA_update(self, conn, talkp, has_been_set, row):  # In use 2023-02-25
        """Try to fill unset fields from a FailedGA template on the talk page."""
        if not has_been_set['type'] or not has_been_set['outcome'] or not has_been_set['outcome_ts']:
            # NOTE(review): template regex argument lost in source (empty r"").
            failed_templates = GAH.get_template_list_from_text(talkp.page.text, r"")
            failedGAs = []
            for failed_template in failed_templates:
                failedGAs.append(FailedGA(failed_template, talkp.page.latest_revision_id, talkp))
            for failedGA in failedGAs:
                if self.match_FailedGA(failedGA):
                    fields = failedGA.get_history_fields()  # call parens were missing (bug fix)
                    record_count = GAH.update_historical_GA_reviews(conn, row, fields, has_been_set)
                    break

    def try_delisted_GA_update(self, conn, talkp, has_been_set, row):  # In use 2023-02-25
        """Try to fill unset fields from a DelistedGA template on the talk page."""
        if not has_been_set['type'] or not has_been_set['outcome'] or not has_been_set['outcome_ts']:
            # NOTE(review): template regex argument lost in source (empty r"").
            delisted_templates = GAH.get_template_list_from_text(talkp.page.text, r"")
            delistedGAs = []
            for delisted_template in delisted_templates:
                delistedGAs.append(DelistedGA(delisted_template, talkp.page.latest_revision_id, talkp))
            for delistedGA in delistedGAs:
                if self.match_DelistedGA(delistedGA):
                    fields = delistedGA.get_history_fields()  # call parens were missing (bug fix)
                    record_count = GAH.update_historical_GA_reviews(conn, row, fields, has_been_set)
                    break

    def try_WBGAN(self, conn, talkp, has_been_set, row, config):  # In use 2023-02-25
        """Try to fill pass outcome and nominator from the WBGAN data set."""
        if self.type == 'GAR':
            # No point in checking GAN history if we already know this is a GAR
            return None
        if not has_been_set['type'] or not has_been_set['outcome'] or not has_been_set['outcome_ts'] or not has_been_set['nominator']:
            rp_dt = GAH.convert_timestamp_to_datetime(self.review_last_edited_ts)
            rv_dt = GAH.convert_timestamp_to_datetime(self.review_ts)
            wbgan_row = WBGAN.get_one(config, self.article_page_title)
            promotion_date = WBGAN.get_promotion_date(config, self.article_page_title, rp_dt)
            if wbgan_row is not None:
                promotion_datetime = GAH.convert_timestamp_to_datetime(wbgan_row['promotion_date'])
                nominator = wbgan_row['nominator']
                sql = "select count(*) as later_GANs from " + GA_config.strings['historical GA reviews table name'] \
                    + " where article_title collate utf8mb4_bin = '" + self.article_page_title.replace("'", "''") + "' and page > " + self.page_num \
                    + " and (type is null or type = 'GAN')"
                cursor = conn.cursor(pymysql.cursors.DictCursor)
                GAN_count_row = cursor.execute(sql)
                count_row = cursor.fetchone()  # fetchone() call parens were missing (bug fix)
                GAN_count = count_row['later_GANs']
                if GAN_count == 0 or abs((promotion_datetime - rp_dt).days) <= 8 or ((promotion_datetime - rv_dt).days < 10 and (promotion_datetime - rv_dt).days >= 0):
                    record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'GAN', 'outcome': 'Pass', 'outcome_ts': promotion_datetime, 'nominator': nominator}, has_been_set)

    def try_GAN_page_revisions(self, conn, talkp, has_been_set, row):  # In use 2023-02-25
        """Look for the nomination entry in WP:GAN revisions shortly after the review started."""
        if self.type == 'GAR':
            # No point in checking GAN history if we already know this is a GAR
            return None
        if not has_been_set['type'] or not has_been_set['nominator'] or not has_been_set['nomination_ts'] or not has_been_set['subtopic']:
            titles = [self.article_page_title]
            # Get the titleparts string from the subpage text
            tp_re = re.search(r"#titleparts:[^|]+\/", self.page.text)
            if tp_re is not None:
                tp_title_matched = self.page.text[tp_re.span(0)[0]:tp_re.span(0)[1]]
                tp_title = tp_title_matched[12:-1]
                if tp_title != self.article_page_title:
                    titles.append(tp_title)
            site = pywikibot.Site("en", "wikipedia")
            GAN_page = pywikibot.Page(site, "Wikipedia:Good article nominations")
            GAN_revs = GAN_page.revisions(total=5, starttime=self.review_ts + datetime.timedelta(0, 600), content=True)
            entry_found = False
            for GAN_rev in GAN_revs:
                if entry_found:
                    break
                for try_title in titles:
                    # NOTE(review): the original built a wrapped pattern here and
                    # then immediately overwrote it with the bare title; the bare
                    # title is what is actually used.
                    r_try_title = try_title
                    if entry_found:
                        break
                    GAN_text = GAN_rev['slots']['main']['*']
                    GAN_text = GAH.strip_comments(GAN_text)
                    GAN_entry_span = GAH.get_GAN_entry_span(GAN_text, r_try_title)
                    if GAN_entry_span is not None:
                        fields = GAH.get_fields_from_GAN_entry(conn, GAN_text, GAN_entry_span)
                        record_count = GAH.update_historical_GA_reviews(conn, row, fields, has_been_set)
                        entry_found = True

    def try_talk_page_revisions(self, conn, tdelta, searchp, has_been_set, row):  # In use 2023-02-25
        """Search up to 500 talk-page revisions for nomination details."""
        if (self.article_page_title[:22] == "Battle of Aleppo (2012" and self.article_page_title[23:] == "2016)") or self.article_page_title == 'Bernie Sanders' or self.article_page_title == 'Joe Biden':
            # Hardcoded exception as the page history is too big.
            return None
        revs = None
        if searchp.page.exists():  # exists() call parens were missing (bug fix)
            revs = searchp.page.revisions(total=500, starttime=self.review_ts + tdelta, content=True)
        if revs is not None:
            self.try_nominees_in_revisions(conn, searchp, has_been_set, row, revs)
            if self.type is None:
                # If we exited the loop and found no nominee template, record that
                record_count = GAH.update_historical_GA_reviews(conn, row, {'comments': 'No template'}, has_been_set)
        else:
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'null', 'comments': 'No revisions'}, has_been_set)

    def try_nominees_in_revisions(self, conn, talkp, has_been_set, row, revs):  # In use 2023-02-25
        """Walk talk-page revisions looking for GAnominee templates for this review."""
        previous_rev = None
        nominee_templates_found = False
        for rev in revs:
            rev_ts = datetime.datetime.strptime(str(rev['timestamp']), "%Y-%m-%dT%H:%M:%SZ")
            if self.review_ts > rev_ts:
                if '*' not in rev['slots']['main']:  # Skip deleted revisions ('.keys' paren bug fixed via direct membership test)
                    continue
                nominees = GAH.get_nominee_list_from_revision(rev, talkp)
                if len(nominees) > 0:
                    nominee_templates_found = True
                if len(nominees) == 0 and nominee_templates_found == True:
                    # We found templates in a rev already, but now there are none,
                    # so they were added in that rev, not this one
                    if self.nominator is None or self.nomination_ts is None:
                        # Here we know we're looking at the revision just before the
                        # nomination template was added, so previous_rev has the
                        # nominator and nomination_ts
                        record_count = GAH.update_historical_GA_reviews(conn, row, {'nominator': previous_rev['user'], 'nomination_ts': previous_rev.timestamp}, has_been_set)
                    break
                else:
                    for nominee in nominees:
                        if self.page_num == nominee.page_num:
                            fields = self.update_from_GAnominee(nominee)
                            record_count = GAH.update_historical_GA_reviews(conn, row, fields, has_been_set)
            if self.type is not None and self.nominator is not None and self.nomination_ts is not None:
                # Quit the loop if we have everything we need
                break
            previous_rev = rev

    def assess_state(self, conn, row, has_been_set, talkp, searchp, articlep, subp):
        """Record redirect/deleted state for this subpage.

        Returns True when the caller's main loop should move on to the next
        review; otherwise the continue flag from evaluate_page_classes.
        """
        if self.page.exists():  # exists() call parens were missing (bug fix)
            if GAN.is_redirect(self.page):
                # Here we mark the page as a redirect; the actual review information
                # will be in the review target page which we will find elsewhere in
                # the main loop
                record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'Redirect'}, has_been_set)
                return True
            (continue_flag, searchp) = GAH.evaluate_page_classes(articlep, talkp, subp, conn, row, has_been_set)
            return continue_flag
        else:
            record_count = GAH.update_historical_GA_reviews(conn, row, {'type': 'Deleted'}, has_been_set)
            return True

class Article_History:

    def __init__(self, conn, template_text, revid, talkp):  # In use 2023-02-25
        """An instance of an Article history template in a talk page revision.

        Parses topic, currentstatus and the numbered actionN* parameters into
        self.actions, a list of dicts keyed by the action sub-parameter names
        ('type', 'date', 'link', 'result', ...).
        Raises GA_history_Exception when template_text is not a well-formed
        Article history template.
        """
        if template_text is None:
            self.text = None
        else:
            self.text = template_text.replace("\n", "")
            self.text = GAH.strip_comments(self.text)
        self.talkp = talkp
        self.talkp_revid = revid
        self.currentstatus = None
        self.topic = None
        self.action_number_to_list_index = {}
        self.actions = []

        param_list = GAH.get_piped_elements(self.text)
        if re.match(r"article\s?history", param_list[0], re.IGNORECASE) is None:
            # The first argument isn't one of the possible names for an Article history template
            raise GA_history_Exception
        elif len(param_list) < 2:
            # We require at least one of the arguments
            raise GA_history_Exception
        else:
            param_dict = {}
            for element in param_list[1:]:
                pv = GAH.get_param_value_pair(element)
                param_dict[pv[0]] = pv[1]
            for k in param_dict:  # was param_dict.keys (method object) -- bug fix
                if k == "topic":
                    self.topic = param_dict[k]
                elif k == "currentstatus":
                    self.currentstatus = param_dict[k]
                elif k[:6] == "action" and len(k) > 6:
                    # len(k) > 6 guards the k[6] index below against a bare
                    # "action" key (IndexError in the original).
                    if k[6].isnumeric():  # isnumeric() call parens were missing (bug fix)
                        action_number = int(k[6])
                        if len(k) > 7:
                            if k[7].isnumeric():
                                action_number = int(k[6:8])
                    else:
                        GAN.log(conn, "Article_History", talkp.page.title(), "Found action parameter in article history template with a non-numeric action number")
                        continue  # Can't save this one if the action number isn't numeric
                    list_index = None
                    if action_number not in self.action_number_to_list_index:
                        # If we haven't seen this action_number before
                        list_index = len(self.actions)
                        self.action_number_to_list_index[action_number] = list_index
                        self.actions.append({})
                    else:
                        list_index = self.action_number_to_list_index[action_number]
                    # Everything after "actionN" (or "actionNN") names the
                    # sub-parameter; a bare "actionN" is the action type.
                    action_param = 'type'
                    if len(k) > 7 and action_number <= 9:
                        action_param = k[7:]
                    if len(k) > 8 and action_number > 9:
                        action_param = k[8:]
                    if action_param == 'date':
                        try:
                            # .strip() was missing its parens in the original (bug)
                            self.actions[list_index]['date'] = parse(param_dict[k].replace("(UTC)", "").strip()).replace(tzinfo=None)
                        except ValueError:
                            pass
                    else:
                        self.actions[list_index][action_param] = param_dict[k]

class GA_history_Exception(Exception):
    """Raised when a GA-related template cannot be parsed. (In use 2023-02-25.)"""
    pass