From Wikipedia, the free encyclopedia
- Download the split files for en.old from http://download.wikimedia.org. As the database grows there occasionally come to be more split files than there are links. Therefore, after downloading e.g. xaa, xab, xac, xad, xae through the links it also may be necessary to download e.g. xaf and xag.
- Compile wikitrunc.c with gcc
- $ cat xa? | bunzip2 | wikitrunc > trunc_old.sql
- $ mysql --database=test --user=administrator --password=
- At the sql prompt, @trunc_old.sql
- Allow a week or more for a typical pc.
- Run the pre.sql script (14 hours)
- Cut and paste Special:Listadmins to a local file named adminlist.txt.
- Edit the post.sql script to change minimums (if desired) and run it; you get a file called "list"
- Edit the fmtlist.c to change the cutoff date, compile with gcc
- fmtlist <list >list.wiki
- Paste to Wikipedia:Another list of Wikipedians in order of arrival
[edit] wikitrunc.c
#include <stdio.h>
myerror( char *str )
{
fprintf( stderr, "%s\n", str );
exit( 1 );
}
main()
{
static char values[] = "VALUES ";
int state = 0, c, i;
while( state < 7 && ( c = getchar() ) != EOF ) {
putchar( c );
if( values[ state ] == c )
state++;
else
state = 0;
}
while ( c != EOF ) {
do {
// want (
if( ( c = getchar() ) != '(' )
myerror( "format error, expecting (" );
putchar( c );
// want ... , ... , ... ,
for( i=0; i<3; i++ )
copyfield();
truncfield();
while( (c=copyfield()) != ')' && c != EOF )
;
if( c != ')' )
myerror( "expecting )" );
c = getchar(); // ,
putchar( c );
} while( c == ',' );
// ; INSERT INTO old VALUES (
state = i = 0;
while( state < 7 && ( c = getchar() ) != EOF ) {
putchar( c );
if( values[ state ] == c )
state++;
else {
state = 0;
if( i++ > 100 )
error( "garbage looking for INSERT INTO old VALUES" );
}
}
}
// must be some trailing stuff, what to do?
//
//
fprintf( stderr, "last char was %c (%d)\n", c, c );
for( i=100; i--; )
fprintf( stderr, "%c", getchar() );
return 0;
}
copyfield()
{
int c;
if( (c=getchar()) == '\'' ) {
putchar( c );
do {
if( (c = getchar()) == '\\' ) {
putchar( c );
putchar( getchar() );
} else {
putchar( c );
}
} while( c != '\'' );
} else
putchar( c );
while( c != EOF && c != ',' && c != ')' ) {
c = getchar();
putchar( c );
}
return c;
}
truncfield()
{
int c;
int maxcopy = 40;
if( (c=getchar()) == '\'' ) {
putchar( c );
do {
if( (c = getchar()) == '\\' ) {
if( maxcopy > 0 ) {
putchar( c );
putchar( getchar() );
} else
getchar();
} else {
if( maxcopy > 0 || c == '\'' ) {
putchar( c );
maxcopy--;
}
}
} while( c != '\'' );
} else
putchar( c );
while( c != EOF && c != ',' && c != ')' ) {
c = getchar();
putchar( c );
}
return c;
}
[edit] pre.sql
drop table if exists edit_months;
drop table if exists track_users;
create table track_users as select count(old_user_text) as total_edits, old_user_text from old
group by old_user_text having total_edits > 199;
# takes 32 minutes
delete from track_users where old_user_text like '%.%.%.%';
delete from track_users where total_edits < 200;
# leaves about 2500 users of interest
create table edit_months
as select count(floor( old_timestamp/100000000 )) as month_edits,
floor( old_timestamp/100000000 ) as yyyymm,
old.old_user_text as user_text, total_edits
from old, track_users
where old.old_user_text = track_users.old_user_text
group by yyyymm, user_text
having month_edits > 4;
# 14 hours 30 minutes
[edit] post.sh
cat > adminadd.sql <<eof
drop table if exists adminlist;
create table adminlist ( user_text varchar(255) binary not null default '' ) TYPE=InnoDB;
insert into adminlist values
eof
sed -e "s/.*ser:/(\'/" adminlist.txt | sed -e "s/$/'),/" >>adminadd.sql
cat >> adminadd.sql <<eof
('bogus last record')
;
delete from adminlist where user_text = 'bogus last record';
eof
mysql --database=test --user=administrator --password= <<eof
source adminadd.sql
drop table if exists tenure;
create table tenure
as select user_text, min( yyyymm ) as yyyymm_first, max( yyyymm ) as yyyymm_last, count( yyyymm ) as months_active, total_edits
from edit_months
group by user_text, total_edits
order by yyyymm_first, user_text;
alter table tenure add ( admin varchar (10) default 'N' );
update tenure, adminlist set tenure.admin = 'Y' where adminlist.user_text = tenure.user_text;
eof
mysql --database=test --user=administrator --password= >list <<eof
select * from tenure where (total_edits > 1000 and months_active > 5) or admin='Y' order by yyyymm_first, user_text;
eof
rm adminadd.sql
[edit] fmtlist.c
main()
{
static char *months[] = { "", "January", "February", "March", "April", "May",
"June", "July", "August", "September", "October", "November", "December" };
char buf[ 500 ];
gets( buf ); // throw away header line
char name[ 500 ], last[ 10 ], first[ 10 ], active[ 10 ], edits[ 10 ], *p, *q;
int admin;
char savefirst[ 10 ];
savefirst[0] = '\0';
printf( "== About this list ==\n"
"This list was compiled using an [[User:UninvitedCompany/user script|automated script]].\n"
"A [[Wikipedia:List of Wikipedians in order of arrival|manually-maintained list]] that includes "
"some interesting commentary but that does not include all users is also available.\n"
"\n"
"The purpose of this list is to document the contributions of early contributors,"
"to draw attention to those prolific contributors who have left the project so that we may better "
"retain contributors in the future, \n"
"and to serve as a resource for those who wish to better understand social trends at Wikipedia.\n\n"
"The list includes [[Wikipedia:Wikipedians|Wikipedians]] who made more than 1000 edits and were active "
"for more than six months."
"[[Wikipedia:Administrators|Administrators]] are included for comparison even if they do not meet the "
"edit count and acvity criteria.\n"
"Edit counts are shown in parenthesis. Administrators are indicated with a *.\n"
"Wikipedians are listed as inactive when they no longer make at least five edits per month.\n"
"Anonymous contributors are not shown.\n"
"\n"
"The list is based on data as of October 10, 2004. Very recent edits -- those marked (top) in a user's "
"contribution list -- are not included because of technical limitations of the script.\n\n" );
while( gets( buf ) ) {
for( p=buf, q=name; *p && *p != '\t'; p++ )
*q++ = *p;
*q = '\0';
p++;
for( q=first; *p && *p != '\t'; p++ )
*q++ = *p;
*q = '\0';
p++;
for( q=last; *p && *p != '\t'; p++ )
*q++ = *p;
*q = '\0';
p++;
for( q=active; *p && *p != '\t'; p++ )
*q++ = *p;
*q = '\0';
p++;
for( q=edits; *p && *p != '\t'; p++ )
*q++ = *p;
*q = '\0';
p++;
admin = *p == 'Y';
if( strcmp( first, savefirst ) != 0 ) {
printf( "\n== [[%s %.4s]] ==\n",
months[ atoi(first+4) ], first );
strcpy( savefirst, first );
}
if( admin )
printf( "#. [[User:%s|%s*]]", name, name );
else
printf( "#. [[User:%s|%s]]", name, name );
printf( " (%s", edits );
printf( ")" );
if( atoi( last ) < 200408 )
printf( " - inactive since %s %.4s",
months[ atoi(last+4) ], last );
printf( "\n" );
}
}