#!/bin/tcsh
# Wikipedia:WikiProject Wikidemia/Quant/Code/parsexml
#
# Runs a series of sed passes over a wiki XML dump file.
# Intermediate results are kept in files under $WDIR.

# Directory that holds the dump and all files this script writes.
setenv WDIR ~tobacman/bulk/data/wiki/dumps
# Name of the dump file inside $WDIR to process.
setenv WFILE idwiki-20060506-pages-meta-history.xml

# Print the start time so the run can be timed against the closing `date`.
date

# Clear out leftovers from any earlier run: the final output first, then the
# two scratch files. -f keeps rm quiet when a file does not exist.
rm -f $WDIR/headers.raw
rm -f $WDIR/headers.raw1
rm -f $WDIR/headers.raw2

# grep the headers: a line matching any address below branches (b) to the end
# of the sed script and is printed; every other line reaches the final `d`
# and is dropped.
# NOTE(review): the addresses ('/ /' and the empty '//') look like they lost
# XML tag names (the text between "<" and ">") when this script was pasted
# through wiki markup. As written this keeps only lines containing a space --
# restore the tag names from the original script before relying on it.
sed -e '/ /b' -e '//b' -e '/ /b' -e '/ /b' -e '/ /b' -e '//b' \
    -e '//b' -e '/ /b' -e d $WDIR/$WFILE > $WDIR/headers.raw1

# delete BOTH leading and trailing whitespace from each line
# NOTE(review): "\t" inside a bracket expression is only a tab in seds that
# support that escape (e.g. GNU sed) -- confirm for the sed in use.
sed 's/^[ \t]*//;s/[ \t]*$//' $WDIR/headers.raw1 > $WDIR/headers.raw2
# Replace the input with the trimmed copy (sed cannot safely write in place
# through a redirect onto its own input).
mv -f $WDIR/headers.raw2 $WDIR/headers.raw1

# substitute "foo" with "bar" ONLY for lines which contain "baz"
# (sed idiom: '/baz/s/foo/bar/g'). Passes 1 and 2 delete every comma on the
# addressed lines; pass 3 turns each "T" into "," and removes the first "Z".
# NOTE(review): the addresses are all '/ /' here, which looks like XML tag
# names were stripped when this script was pasted through wiki markup.
# Presumably pass 3 targeted ISO-8601 timestamp lines (...T...Z) -- confirm
# and restore the tag names; as written it rewrites any line with a space.

# Pass 1: drop commas.
sed '/ /s/,//g' $WDIR/headers.raw1 > $WDIR/headers.raw2
mv -f $WDIR/headers.raw2 $WDIR/headers.raw1

# Pass 2: drop commas (same command as written; originally a different tag?).
sed '/ /s/,//g' $WDIR/headers.raw1 > $WDIR/headers.raw2
mv -f $WDIR/headers.raw2 $WDIR/headers.raw1

# Pass 3: "T" -> "," and strip "Z".
sed '/ /s/T/,/g;/ /s/Z//' $WDIR/headers.raw1 > $WDIR/headers.raw2
mv -f $WDIR/headers.raw2 $WDIR/headers.raw1


 * 1) if a line begins with an equal sign, append it to the previous line
 * 2) and replace the "=" with a single space
 * 3) sed -e :a -e '$!N;s/\n=/ /;ta' -e 'P;D'
 * 4)     

sed -e :a -e '$\!N;s/\n/,/;ta' -e 'P;D' $WDIR/headers.raw1 > $WDIR/headers.raw2 mv -f $WDIR/headers.raw2 $WDIR/headers.raw1 sed -e :a -e '$\!N;s/\n /,/;ta' -e 'P;D' $WDIR/headers.raw1 > $WDIR/headers.raw2 mv -f $WDIR/headers.raw2 $WDIR/headers.raw1 sed -e :a -e '$\!N;s/\n/,ip,/;ta' -e 'P;D' $WDIR/headers.raw1 > $WDIR/headers.raw2 mv -f $WDIR/headers.raw2 $WDIR/headers.raw1 sed -e :a -e '$\!N;s/\n /,name,/;ta' -e 'P;D' $WDIR/headers.raw1 > $WDIR/headers.raw2 mv -f $WDIR/headers.raw2 $WDIR/headers.raw1 sed -e :a -e '$\!N;s/\n /,/;ta' -e 'P;D' $WDIR/headers.raw1 > $WDIR/headers.raw2 mv -f $WDIR/headers.raw2 $WDIR/headers.raw1 sed -e :a -e '$\!N;s/\n/,-1,/;ta' -e 'P;D' $WDIR/headers.raw1 > $WDIR/headers.raw2 mv -f $WDIR/headers.raw2 $WDIR/headers.raw1
 * 1) Put in ,-1, when it's a minor edit.

# remove most HTML tags (accommodates multiple-line tags)
# s/<[^>]*>//g deletes every complete tag on the line; if a "<" is still left
# (a tag that continues on the next line), N appends the next line and //ba
# loops back to :a to try again.
# NOTE(review): the tail of this command ("</N;//ba' $WDIR/headers.raw1 >")
# was missing from the pasted script and has been restored from the standard
# sed one-liner this comment quotes; the input file name is inferred from the
# preceding steps -- confirm against the original script.
sed -e :a -e 's/<[^>]*>//g;/</N;//ba' $WDIR/headers.raw1 > $WDIR/headers.raw
# The scratch file is no longer needed once the final output is written.
rm -f $WDIR/headers.raw1

# Print the finish time.
date

# End the script here.
exit