Research talk:Automated classification of draft quality/Work log/2016-09-29

From Meta, a Wikimedia project coordination wiki
Jump to navigation Jump to search

Thursday, September 29, 2016[edit]

I just checked and my query to get drafts generated in the last year is still running! Arg! --EpochFail (talk) 14:39, 29 September 2016 (UTC)[reply]


And we just lost connection with the mysql server. So, I broke the query down into monthly chunks. here's what the command for doing that looks like:

echo 'SET @start="201509", @end="201510";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201509.tsv
echo 'SET @start="201510", @end="201511";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201510.tsv
echo 'SET @start="201511", @end="201512";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201511.tsv
echo 'SET @start="201512", @end="201601";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201512.tsv
echo 'SET @start="201601", @end="201602";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201601.tsv
echo 'SET @start="201602", @end="201603";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201602.tsv
echo 'SET @start="201603", @end="201604";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201603.tsv
echo 'SET @start="201604", @end="201605";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201604.tsv
echo 'SET @start="201605", @end="201606";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201605.tsv
echo 'SET @start="201606", @end="201607";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201606.tsv
echo 'SET @start="201607", @end="201608";' | \
	cat - sql/draft_quality.variables.sql | \
	mysql -h analytics-store.eqiad.wmnet -u research enwiki > \
	datasets/enwiki.draft_quality.201607.tsv
cat \
	  datasets/enwiki.draft_quality.201508.tsv \
	  datasets/enwiki.draft_quality.201509.tsv \
	  datasets/enwiki.draft_quality.201510.tsv \
	  datasets/enwiki.draft_quality.201511.tsv \
	  datasets/enwiki.draft_quality.201512.tsv \
	  datasets/enwiki.draft_quality.201601.tsv \
	  datasets/enwiki.draft_quality.201602.tsv \
	  datasets/enwiki.draft_quality.201603.tsv \
	  datasets/enwiki.draft_quality.201604.tsv \
	  datasets/enwiki.draft_quality.201605.tsv \
	  datasets/enwiki.draft_quality.201606.tsv \
	  datasets/enwiki.draft_quality.201607.tsv > \
	datasets/enwiki.draft_quality.201508-201608.tsv

And here's the query with the variables:

SELECT
  page_title,
  rev_id,
  rev_timestamp AS creation_timestamp,
  FALSE AS archived,
  "OK" AS draft_quality 
FROM revision 
INNER JOIN page ON
  rev_page = page_id WHERE
  rev_timestamp BETWEEN @start AND @end AND
  rev_parent_id = 0 AND
  page_namespace = 0 

UNION ALL 

SELECT
  ar_title AS page_title,
  ar_rev_id AS rev_id,
  ar_timestamp AS creation_timestamp,
  True AS archived,
  IF(log_comment REGEXP "WP:CSD#G3\\|", "vandalism",
       IF(log_comment REGEXP "WP:CSD#G10\\|", "attack",
       IF(log_comment REGEXP "WP:CSD#G11\\|", "spam",
       IF(log_comment REGEXP "WP:CSD#A11\\|", "hoax", "OK")))) AS draft_quality 
FROM archive 
LEFT JOIN logging speedy_delete ON
  log_namespace = ar_namespace AND
  log_title = ar_title AND
  log_type = "delete" AND
  log_action = "delete" AND
  log_comment LIKE "[[WP:CSD#%" AND
  log_comment REGEXP "WP:CSD#(G3|G10|G11|A11)\\|" AND
  log_timestamp > ar_timestamp 
WHERE
  ar_timestamp BETWEEN @start AND @end AND
  log_timestamp BETWEEN @start AND @end AND
  ar_parent_id = 0 AND
  ar_namespace = 0;