# This script is released under the GFDL license; see
# http://en.wikipedia.org/w/index.php?title=User:HBC_Archive_Indexerbot/source&action=history
# for a full list of contributors.
# Translated and adapted for the Slovenian Wikipedia by Mihael Simonič
### Configuration ###
# Time to sleep between writes (in seconds)
my $write_delay = 5;
# Max pages to download at once
my $download_max = 25;
# Default template
my $default_template = 'Uporabnik:Mihas-bot/kazalo-predloga';
# Cache paths
my $cache = 'cache';
my $wiki_cache = "$cache/wiki";
my $obj_cache = "$cache/obj";
my $report_cache = "$cache/reports";
### End Configuration ###
use strict;
use warnings;
use Data::Dumper;
use Date::Parse;
use Digest::SHA qw(sha1_hex);
use IO::Socket;
use MediaWiki;
use POSIX qw(strftime);
use Storable;
use Time::Duration;
use Time::Local;
use URI::Escape;
use XML::Simple;
my($log_file,$pages_watched,$pages_downloaded,$pages_attempted,$dl,$ul) = ('',0,0,0,0,0);
my $nowiki = 'nowiki';
my $start_time = undef;
#########################################
# Log into Wikipedia #
#########################################
die "Mape za pomnilnik morajo biti ustvarjene pred zagonom bota\n" unless (-d $cache && -d $wiki_cache && -d $obj_cache);
open(PASS,'password') or die "Ne morem odpreti datoteke 'password': $!\n"; # A file with only the password, no carriage return
sysread(PASS, my $password, -s(PASS)); # No password in the source code.
close(PASS);
writelog ('Povezovanje na Wikipedijo');
my $c = MediaWiki->new;
$c->setup
({
'bot' => {'user' => 'Mihas-bot','pass' => $password},
'wiki' => {'host' => 'sl.wikipedia.org','path' => 'w'}
}) || die 'Napaka v prijavi';
my $whoami = $c->user();
writelog ($whoami.' povezan');
#########################################
# Update cache to current state #
#########################################
writelog ('Zbiranje nalog');
my @master_job_list = gather_jobs();
my @post_jobs = @master_job_list;
writelog (scalar(@master_job_list).' nalog najdenih');
writelog ('Razčlenjevanje pomnilnika');
my $rh_cache_data = parse_cache();
writelog ('končano');
writelog ('Razčlenjevanje spiska nadzorov');
my $rh_modified_data = parse_watchlist();
writelog ('končano');
download_pages(find_updated());
download_pages(find_templates(@master_job_list));
#push (@needed,&find_holes());
fetch_pages(@master_job_list);
writelog ("$pages_watched strani dodanih na spisek nadzorov.");
writelog ("$pages_downloaded od $pages_attempted strani prenesenih.");
#########################################
# Parse cached data and create reports #
#########################################
writelog ('Creating reports');
foreach my $ra_job (@post_jobs) {
my $page = $ra_job->{'page'};
my $dest = $ra_job->{'target'};
my $dest_escaped = _escape($dest);
my $mask = join(', ', @{$ra_job->{'mask'}});
my $index_here = $ra_job->{'indexhere'};
unless (check_text(1000,$dest))
{
writelog ('Preklic pisanja na [['.$dest.']] zaradi neustreznih dovoljenj (vir: '.$page.')');
next;
}
my $report = create_report($ra_job);
open(REPORT, ">$report_cache/$dest_escaped.$$") or die "Ne morem pisati začasne datoteke poročila: $!\n";
use bytes;
print REPORT $report;
close(REPORT);
if (-e "$report_cache/$dest_escaped") {
my $result = `diff --brief "$report_cache/$dest_escaped" "$report_cache/$dest_escaped.$$"`;
unless ($result) {
writelog ('No change, skipping [['.$dest.']]');
unlink "$report_cache/$dest_escaped.$$";
next;
}
}
writelog ('Pisanje poročila na [['.$dest.']]');
my $edit_summary = "Pisanje kazala arhivov na zahtevo: $mask - \[\[$page\]\] - urejanje bota";
send_report($dest,$report,$edit_summary);
rename "$report_cache/$dest_escaped.$$", "$report_cache/$dest_escaped";
}
$ul += 120 + length($log_file);
writelog ('Končano, preneseno {{formatnum:'.int($dl/1024).'}} kilobajtov in naloženo {{formatnum:'.int($ul/1024).'}} kilobajtov (približna ocena)');
&post_log();
exit;
#########################################
# Subroutines #
#########################################
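# check_text($bytes, $page): fetch roughly the first $bytes bytes of the raw wikitext of $page over a
# plain HTTP socket and return true only if the text contains the
# '<!-- Mihas-bot lahko povozi to stran -->' marker, i.e. the bot is allowed to overwrite the page.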
sub check_text {
my $bytes = shift;
my $page = shift;
my $host = 'sl.wikipedia.org';
my $path = _underscore("/w/index.php?title=$page&action=raw");
my $sock = new IO::Socket::INET
(
PeerAddr => $host,
PeerPort => 80,
Proto => 'tcp',
);
return 0 unless ($sock);
#TO_DO!
# Plain HTTP/1.0 request with a Host header, so the server closes the connection when the response ends
my $header = ('GET '.$path.' HTTP/1.0'."\r\n".'Host: '.$host."\r\n".'User-Agent: Mihas-bot 0.1'."\r\n\r\n");
syswrite ($sock, $header);
my ($buf, $content, $done) = ('', '', 0);
while (!$done)
{
($done = 1) unless sysread($sock, $buf, $bytes);
$content .= $buf;
if ((length($content) >= $bytes) || ($content =~ m|!-- Mihas-bot lahko povozi to stran --|))
{
$done = 1;
}
}
close($sock);
$dl += length($content);
return ($content =~ m|!-- Mihas-bot lahko povozi to stran --|);
}
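# create_report($ra_job): build the wikitext index for one job by combining the job's template with the
# headings collected by index_headings(), filling the %%...%% placeholders in each row.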
sub create_report {
my ($ra_job) = @_;
my ($rh_index, $numbered_links) = index_headings($ra_job);
my $template = get_template($ra_job->{'template'});
my $report = sprintf("%sTo kazalo je bilo avtomatsko generirano na zahtevo na strani [[%s]]. Pokriva naslednje arhive: '''%s'''\n<br/>Poročilo je generiral [[Uporabnik:Smihael|Miha]] ob 17:14, 10. januar 2009 (CET)\n----\n\n",
$template->{'lead'}, $ra_job->{'page'}, join(', ', @{$ra_job->{'mask'}}));
$report .= $template->{'header'};
my $i = 0;
foreach my $key (sort {lc($a) cmp lc($b) || $rh_index->{$a}->{'root_path'} cmp $rh_index->{$b}->{'root_path'}}
(keys(%{$rh_index}))) {
$rh_index->{$key}->{'topic'} =~ s:({{.*?}}|[|!]{2}):<$nowiki>$1</$nowiki>:g;
my $row = $template->{'row'};
if ($template->{'altrow'}) {
unless ($i++ % 2 == 0) {
$row = $template->{'altrow'}
}
}
foreach ('topic','replies','link','first','last','duration',
'firstepoch','lastepoch','durationsecs') {
$row =~ s:%%$_%%:${$rh_index}{$key}{$_}:gi;
}
$report .= $row;
}
$report .= sprintf("%s\n%s", $template->{'footer'}, $template->{'tail'});
return $report;
}
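# download_pages(@pages): fetch pages in batches via Special:Export, cache each returned revision on
# disk through handle_chunk() and return the list of page names actually received.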
sub download_pages {
my (@pages) = @_;
return unless @pages;
my $requests = scalar(@pages);
my (@received_names);
while (@pages) {
my @batch;
while ((scalar(@batch) < $download_max) && @pages) { # batch size set in the configuration block
my $item = shift(@pages) || last;
$item = _underscore($item);
push (@batch, $item);
}
$pages_attempted += scalar(@batch);
my $xml_code = $c->special_export(@batch);
$dl += length($xml_code);
my $xml_result = XMLin($xml_code);
next unless ($xml_result->{'page'});
if ($xml_result->{'page'}{'title'}) {
push (@received_names, handle_chunk($xml_result->{'page'}));
} else {
foreach my $key (keys %{$xml_result->{'page'}}) {
push (@received_names, handle_chunk($xml_result->{'page'}->{$key}));
}
}
}
writelog('Prenesenih '.scalar(@received_names)." od $requests zahtevanih strani");
return (@received_names);
}
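# fetch_pages(@jobs): work out which archive pages each job still needs; numbered masks are probed ten
# pages past the highest number already cached, and the probing repeats (redo) while every requested
# page keeps arriving. Downloaded pages are folded back into the cache index as they come in.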
sub fetch_pages {
my (@jobs) = @_;
my (@cache_names) = keys(%$rh_cache_data);
foreach my $ra_job (@jobs) {
my @fetch;
if ($ra_job->{'indexhere'}) {
my $page = _underscore($ra_job->{'page'});
push(@fetch, $ra_job->{'page'}) unless (grep(/^\Q$page\E$/, @cache_names));
}
my $fetch_size = 0;
foreach my $mask (@{$ra_job->{'mask'}}) {
if ($mask =~ m|<#>|) {
$fetch_size += 10;
my $pattern = _underscore($mask);
my ($part1, $part2) = split(m|<#>|, $pattern, 2);
$pattern = qr/\Q$part1\E(\d+)/;
$pattern .= qr/\Q$part2\E/ if $part2;
my $leading_zeros = $ra_job->{'leading_zeros'}+1;
my $marker = '%d';
$marker = '%0'.$leading_zeros.'d' if ($leading_zeros > 1);
my $printf_pattern = $mask;
$printf_pattern =~ s|<#>|$marker|;
my(@mask_pages) = grep(/$pattern/,@cache_names);
my $largest = 0;
foreach my $key (@mask_pages) {
($key =~ m|$pattern|) || next;
$largest = $1 if ($1 > $largest);
}
my $count = $largest;
until ($count >= ($largest + $fetch_size)) {
$count++;
my $page_name = sprintf($printf_pattern, $count);
push(@fetch,$page_name);
}
# MONTHLY: elsif here for the <date> or whatever is used
} else {
my $check = _underscore($mask);
push (@fetch, $mask) unless (grep(/^\Q$check\E$/, @cache_names));
}
} continue {
if (scalar(@fetch)) {
my (@received) = download_pages(@fetch);
$rh_cache_data = parse_cache();
(@cache_names) = keys(%$rh_cache_data);
if (scalar(@fetch) == scalar(@received)) {
@fetch = ();
redo;
} else {
@fetch = ();
}
}
$fetch_size = 0;
}
}
}
sub find_holes # This sub will find gaps in the archive (maybe a page was deleted and then restored) and
{ # add them to the list of potentially needed pages; currently a stub
return();
}
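# find_templates(@jobs): return the list of index templates referenced by the jobs that are not yet
# present in the wiki cache.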
sub find_templates {
my (@jobs) = @_;
my %templates;
my @templates_needed;
foreach my $ra_job (@jobs) {
$templates{$ra_job->{'template'}}++;
}
foreach my $template (keys %templates) {
$template = $default_template if $template eq 'default';
my $tmpl_under = _underscore($template);
push(@templates_needed, $template) unless grep(/^\Q$tmpl_under\E$/, keys %{$rh_cache_data});
}
writelog (scalar(@templates_needed).' templates needed');
return @templates_needed;
}
sub find_updated # Find items that have changed
{
my(@need_update);
foreach my $page (keys(%{$rh_cache_data})) {
if ($rh_modified_data->{$page}) { # If it's not on the watchlist, it hasn't been
# modified in the past month, so ignore it
if ($rh_cache_data->{$page} < ${$rh_modified_data}{$page}) {
push(@need_update,$page);
my $fname = ("$wiki_cache/".uri_escape($page).' '.$rh_cache_data->{$page});
unlink($fname); # Remove old item
}
}
}
writelog (scalar(@need_update).' pages need updating');
return @need_update;
}
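# gather_jobs(): scrape Special:Whatlinkshere for pages transcluding User:Mihas-bot/OptIn, parse the
# {{User:Mihas-bot/OptIn|...}} parameters on each page and return one job hash per valid request;
# requests that duplicate an already-claimed target page are skipped.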
sub gather_jobs
{
my (@jobs);
my $html_list = $c->{ua}->get($c->{index}."?title=Special:Whatlinkshere/User:Mihas-bot/OptIn&limit=5000&uselang=en")->content(); # uselang=en so the "(transclusion)" marker matched below stays English
$dl += length($html_list);
my @targets;
while ($html_list =~ s|>([^<]*?)</a> \(transclusion\)||)
{
push(@targets,$1);
}
my $xml_source = $c->special_export(@targets);
$dl += length($xml_source);
my $xml = XMLin($xml_source);
my $rh_pages = ${$xml}{'page'};
my %targets;
foreach my $key (keys(%{$rh_pages})) {
my $content = ${$rh_pages}{$key}{'revision'}{'text'}{'content'};
if ($content =~ m"\Q{{User:Mihas-bot/OptIn\E\s*\|(.+?)\s*\Q}}\E"s) {
my @params = split(/\s*\|\s*/, $1);
my %job = ( page => $rh_pages->{$key}{'title'}, leading_zeros => 0 );
foreach my $param (@params) {
my ($key, $value) = split(/\s*=\s*/, $param);
next unless ($key && defined($value));
$value =~ s:^\.?/:$job{'page'}/:;
if ($key eq 'target') {
$job{'target'} = $value;
} elsif ($key eq 'mask') {
next unless $value;
push (@{$job{'mask'}}, $value);
} elsif ($key =~ /leading_zeroe?s/) {
if ($value =~ m/^(\d+)$/) {
$job{'leading_zeros'} = $1;
}
} elsif ($key eq 'indexhere') {
$job{'indexhere'} = (($value =~ m|ye?s?|i) ? ('1') : ('0'));
} elsif ($key eq 'template') {
$job{'template'} = $value;
}
}
$job{'template'} = 'default' unless $job{'template'};
$job{'template'} = 'default' if $job{'template'} eq 'template location';
next unless ($job{'target'} && $job{'mask'});
if ($targets{$job{'target'}}) {
writelog("Na zahtevo $job{'page'} podvajam cilj $job{'target'}; preklic");
next;
} else {
$targets{$job{'target'}}++;
}
push(@jobs,\%job);
}
}
return @jobs;
}
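# get_template($name): load a cached template page and split it on its '<!-- section -->' markers into
# lead/header/row/altrow/footer/tail parts, falling back to the default template when the named one is
# missing or has no 'row' section.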
sub get_template {
my ($template) = (@_);
if ($template eq 'default') {
$template = $default_template;
}
my $tmpl_fn = _escape($template);
my ($file) = glob("$wiki_cache/$tmpl_fn*");
unless ($file) {
if ($template eq $default_template) {
die "$template missing from cache\n";
} else {
return get_template('default');
}
}
open(TMPL, $file);
my @content = <TMPL>;
close(TMPL);
my %template = (lead => '', header => '', row => '', altrow => '',
footer => '', tail => '');
my $section = '';
foreach my $line (@content) {
chomp $line;
if ($line =~ m:^<!--\s*(.*?)\s*-->$:) {
$section = lc($1);
$section =~ s/\s+//g;
last if $section eq 'end';
} else {
if ($section) {
next unless $line;
$template{$section} .= "$line\n";
}
}
}
$template{'lead'} .= "\n" if $template{'lead'};
unless ($template{'row'}) {
die "Predlogi manjka 'row' parameter!\n" if $template eq $default_template;
writelog("Pokvarjena predloga: '$template', uporabljam privzeto predlogo");
return get_template('default');
}
return \%template;
}
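# handle_chunk($chunk): write one page revision from a Special:Export result to the wiki cache (the
# file name carries the revision timestamp) and add the page to the watchlist if it is new.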
sub handle_chunk {
my $chunk = shift;
my $name = _underscore(${$chunk}{'title'});
my $fname = "$wiki_cache/".uri_escape($name);
${$chunk}{'revision'}{'timestamp'} =~ m|(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z|;
my $time = timegm($6,$5,$4,$3,$2-1,$1);
watch($name) unless (${$rh_cache_data}{$name});
open(OUT,">$fname $time") or die "Ne morem pisati v predpomnilnik ($fname $time): $!\n";
binmode(OUT);
use bytes;
print OUT (${$chunk}{'revision'}{'text'}{'content'});
no bytes;
close(OUT);
$pages_downloaded++;
return $name;
}
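# index_headings($ra_job): scan every cached archive page matched by the job's masks (plus the request
# page itself when indexhere is set), collect the level-2 headings and, where signatures can be parsed,
# the reply count and first/last timestamps for each thread.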
sub index_headings {
my ($ra_job) = @_;
my $mask_re = '';
foreach my $mask (@{$ra_job->{'mask'}}) {
my $mask2 = _escape($mask);
if ($mask2 =~ m|%3C%23%3E|) {
my ($part1, $part2) = split(m|%3C%23%3E|, $mask2, 2);
$mask_re .= '(?:';
$mask_re .= qr/\Q$part1\E\d+/;
$mask_re .= qr/\Q$part2\E/ if $part2;
$mask_re .= ')|';
# MONTHLY: elsif here for <date>
} else {
$mask_re .= qr/\Q$mask2\E/.'|';
}
}
chop($mask_re);
opendir(CACHE,$wiki_cache);
my(@cache) = readdir(CACHE);
closedir(CACHE);
my @files = grep(m|^(?:$mask_re)|,@cache);
if ($ra_job->{'indexhere'}) {
my $page = _escape($ra_job->{'page'});
push(@files, grep(m|^\Q$page\E \d+$|,@cache));
}
my (%index, %used_headings);
my $numbered_links = 0;
foreach my $file (@files) {
my (%used_names);
next unless ($file =~ m|^(.*) (\d+)$|);
my $root_path = uri_unescape($1);
my $display_name = $root_path;
$display_name =~ s/_/ /g;
open(WIKI, "$wiki_cache/$file");
my @content = <WIKI>;
close(WIKI);
my $prev_heading = '';
my ($comment_count,$first,$last) = (0,0,0);
foreach my $line (@content) {
if ($line =~ m|^==\s*([^=].+?)\s*==|) {
if ($prev_heading && $comment_count > 0) {
## WARNING: This code is duplicated below vvvvvv
$index{$prev_heading}->{'replies'} = $comment_count;
if ($first && $last) {
$index{$prev_heading}->{'firstepoch'} = $first;
$index{$prev_heading}->{'first'} = strftime('%F %T',gmtime($first));
$index{$prev_heading}->{'lastepoch'} = $last;
$index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last));
$index{$prev_heading}->{'durationsecs'} = $last - $first;
if ($comment_count > 1) {
$index{$prev_heading}->{'duration'} = duration($last - $first);
} else {
$index{$prev_heading}->{'duration'} = 'Brez';
}
}
$comment_count = 0;
$first = 0;
$last = 0;
}
my $heading = $1;
my $head_link;
($head_link, $numbered_links) = path_fix($heading, $numbered_links);
$used_names{$head_link}++;
my $suffix = (($used_names{$head_link} > 1) ? ('_'.$used_names{$head_link}) : (''));
$used_headings{$head_link.$suffix}++;
$prev_heading = $head_link.$suffix.'_'.$used_headings{$head_link.$suffix};
$index{$prev_heading} = { topic => $heading, link =>
("[[{{urlencode:$root_path}}#$head_link".$suffix."|$display_name]]"),
root_path => $root_path, head_link => $head_link,
replies => 'Neznano', first => 'Neznano',
'last' => 'Neznano', duration => 'Neznano',
firstepoch => 0, lastepoch => 0,
durationsecs => 0,
};
} elsif ($line =~ m/\[\[(?:User|Uporabnik).*[\]>)}].*?\s+(.*\((?:UTC|CES?T)\))/) { # sl.wiki signatures link to [[Uporabnik:...]] and show CET/CEST times
$comment_count++;
my $time = str2time($1);
if ($time && (!$first || $time < $first)) {
$first = $time;
}
if ($time && ($time > $last)) {
$last = $time;
}
}
}
if ($prev_heading && $comment_count > 0) {
## WARNING: This code is duplicated from above ^^^^^^
$index{$prev_heading}->{'replies'} = $comment_count;
if ($first && $last) {
$index{$prev_heading}->{'firstepoch'} = $first;
$index{$prev_heading}->{'first'} = strftime('%F %T', gmtime($first));
$index{$prev_heading}->{'lastepoch'} = $last;
$index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last));
$index{$prev_heading}->{'durationsecs'} = $last - $first;
if ($comment_count > 1) {
$index{$prev_heading}->{'duration'} = duration($last - $first);
} else {
$index{$prev_heading}->{'duration'} = 'Brez';
}
}
}
}
return (\%index, $numbered_links);
}
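# parse_cache(): map cached page names to the revision timestamps encoded in the cache file names.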
sub parse_cache
{
opendir(CACHE,$wiki_cache);
my(@files) = readdir(CACHE);
closedir(CACHE);
my(%cache);
foreach my $file (@files)
{
next unless ($file =~ m|^(.*) (\d+)$|);
my $page_name = uri_unescape($1);
my $time = $2;
$cache{$page_name} = $time;
}
return \%cache;
}
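# parse_watchlist(): scrape Special:Watchlist and return a hash of page name => epoch time of the last
# change listed there.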
sub parse_watchlist
{
my $watchlist = $c->{ua}->get($c->{index}."?title=Special:Watchlist&days=0&uselang=en")->content(); # uselang=en so the "hist" link text matched below stays English
$dl += length($watchlist);
my @lines = split("\n",$watchlist);
my @date;
my %watchlist;
while (scalar(@lines))
{
my $line = shift(@lines);
if ($line =~ m|<h4>(\d{4})-(\d{2})-(\d{2})</h4>|i)
{
@date = ($1,$2,$3);
}
if ($line =~ m|title="([^"]*?)">hist</a>|i) # "
{
my $page_name = _underscore($1);
$line =~ m|(\d{2}):(\d{2}):(\d{2})|;
$watchlist{$page_name} = timegm($3,$2,$1,$date[2],$date[1]-1,$date[0]);
}
}
return \%watchlist;
}
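# path_fix($heading, $numbered_links): turn a section heading into the anchor MediaWiki generates for
# it (strip wiki markup, encode unsafe characters as .XX) and keep a running count of bare external
# links so they can be rendered as [1], [2], ...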
sub path_fix {
my ($path,$numbered_links) = @_;
($path =~ s|'{2,4}||g);
($path =~ s|<.*?>||g);
($path =~ s/\[\[:?.*?\|(.*?)\]\]/$1/g);
($path =~ s|\[\[:?(.*?)\]\]|$1|g);
while ($path =~ m|\[.*?\]|) {
my $title;
if ($path =~ m|\[[^ ]* (.*?)\]|) {
$title = $1;
} else {
$numbered_links++;
$title = ".5B$numbered_links.5D";
}
$path =~ s|\[.*?\]|$title|;
}
($path =~ s|\s|_|g);
($path =~ s|\xC2\xA0|.C2.A0|g); # encode UTF-8 non-breaking spaces the way MediaWiki anchors do
while ($path =~ m|([^/a-z0-9\.:_'-])|i) {
my $bad = $1;
my $fix = uc('.'.sprintf("%x",ord($bad)));
($path =~ s/\Q$bad/$fix/g);
}
return ($path,$numbered_links);
}
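# post_log(): save the accumulated log to User:Mihas-bot/Indeksiranje/dnevnik.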
sub post_log {
my $pg = $c->get('User:Mihas-bot/Indeksiranje/dnevnik', 'rw');
$pg->{summary} = ('Pisanje dnevniške datoteke '.$start_time).' - urejanje bota';
$pg->{content} = $log_file;
$pg->save();
}
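# send_report($dest, $report, $edit_summary): write the generated index to $dest, prefixed with the
# overwrite marker, then pause for $write_delay seconds.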
sub send_report {
my $dest = shift;
my $report = shift;
my $edit_summary = shift;
my $pg = $c->get($dest, 'w');
$pg->{summary} = $edit_summary;
$pg->{content} = '<!-- Mihas-bot lahko povozi to stran -->'."\n".$report;
$ul += length($report);
$pg->save() || writelog("Shranjevanje poročila na $dest ni uspelo");
sleep($write_delay);
}
sub watch
{
my $page_name = shift;
my $success = $c->{ua}->get($c->{index}."?title=$page_name&action=watch")->is_success;
$pages_watched++ if ($success);
return $success;
}
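# writelog($entry): append a timestamped entry (Slovenian month names) to the in-memory log and echo it
# to STDERR; the first call also records $start_time.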
sub writelog {
my $entry = shift;
my @month_table =
(
'januar',
'februar',
'marec',
'april',
'maj',
'junij',
'julij',
'avgust',
'september',
'oktober',
'november',
'december',
);
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime(time);
my $time = sprintf("%02d:%02d:%02d %02d %s %04d", $hour,$min,$sec,$mday,$month_table[$mon],($year+1900));
$start_time ||= $time;
$log_file .= ('* '.$time.': '.$entry.' [[Uporabnik:Smihael|Miha]]'."\n");
warn "$entry\n";
}
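# _escape / _hash / _underscore: small helpers for normalising page names (underscores, URI escaping,
# SHA-1 hashing).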
sub _escape {
my ($val) = @_;
$val = _underscore($val);
$val = uri_escape($val);
return $val;
}
sub _hash {
my ($val) = @_;
$val = _escape($val);
$val = sha1_hex($val);
return $val;
}
sub _underscore {
my ($val) = @_;
$val =~ s|\s|_|g;
return $val;
}