Uporabnik:Mihas-bot/Indeksiranje/izvorna koda

    Iz Wikipedije, proste enciklopedije
    # This script is released under the GFDL license, see
    # http://en.wikipedia.org/w/index.php?title=User:HBC Archive Indexerbot/source&action=history
    # for a full list of contributors
    # Za slovesnko Wikipedijo prevedel in prilagodil Mihael Simonič
    
    ### Configuration ###
    # Time to sleep between writes (in seconds)
    my $write_delay = 5;
    # Max pages to download at once
    my $download_max = 25;
    
    # Default template
    my $default_template = 'Uporabnik:Mihas-bot/kazalo-predloga';
    
    # Cache paths
    my $cache = 'cache';
    my $wiki_cache = "$cache/wiki";
    my $obj_cache = "$cache/obj";
    my $report_cache = "$cache/reports";
    ### End Configuration ###
    
    use strict;
    use warnings;
    use Data::Dumper;
    use Date::Parse;
    use Digest::SHA qw(sha1_hex);
    use IO::Socket;
    use MediaWiki;
    use POSIX qw(strftime);
    use Storable;
    use Time::Duration;
    use Time::Local;
    use URI::Escape;
    use XML::Simple;
    
    
    my($log_file,$pages_watched,$pages_downloaded,$pages_attempted,$dl,$ul) = ('',0,0,0,0,0);
    my $nowiki = 'nowiki';
    my $start_time = undef;
    #########################################
    # Log into Wikipedia                    #
    #########################################
    die "Mape za pomnilnik morajo biti ustvarjene pred zagonom bota\n" unless (-d $cache && -d $wiki_cache && -d $obj_cache);
    open(PASS,'password');                  # A file with only the password, no carraige return
    sysread(PASS, my $password, -s(PASS));  # No password in sourcecode.
    close(PASS);
    writelog                  ('Povezovanje na Wikipedijo');
    my $c                  =   MediaWiki->new;
    $c->setup
                            ({
                              'bot' => {'user' => 'Mihas-bot','pass' => $password},
                              'wiki' => {'host' => 'sl.wikipedia.org','path' => 'w'}
                            }) || die 'Napaka v prijavi';
    my $whoami              =  $c->user();
    writelog                 ($whoami.' povezan');
    
    #########################################
    # Update cache to modern state          #
    #########################################
    writelog                 ('Zbiranje nalog');
    my @master_job_list     =  gather_jobs();
    my @post_jobs           = @master_job_list;
    
    
    writelog                  (scalar(@master_job_list).' nalog najdenih');
    writelog                 ('Razčlenjevanje pomnilnika');
    my $rh_cache_data       =  parse_cache();
    writelog                 ('končano');
    writelog                 ('Razčlenjevanje spiska nadzorov');
    my $rh_modified_data    =  parse_watchlist();
    writelog                 ('končano');
    download_pages(find_updated());
    download_pages(find_templates(@master_job_list));
    #push                     (@needed,&find_holes());
                               fetch_pages(@master_job_list);
    writelog                 ("Strani $pages_watched so dodane na spisek nadzorov.");
    writelog                 ("$pages_downloaded od $pages_attempted prenešenih.");
    #########################################
    # Parse cached data and create reports  #
    #########################################
    
    writelog ('Creating reports');
    foreach my $ra_job (@post_jobs) {
      my $page = $ra_job->{'page'};
      my $dest = $ra_job->{'target'};
      my $dest_escaped = _escape($dest);
      my $mask = join(', ', @{$ra_job->{'mask'}});
      my $index_here = $ra_job->{'indexhere'};
      unless (check_text(1000,$dest))
        {
        writelog ('Preklic pisanja na [['.$dest.']] zaradi neustreznih dovoljenj (vir: '.$page.')');
        next;
        }
      my $report = create_report($ra_job);
      open(REPORT, ">$report_cache/$dest_escaped.$$");
      use bytes;
      print REPORT $report;
      close(REPORT);
      if (-e "$report_cache/$dest_escaped") {
        my $result = `diff --brief "$report_cache/$dest_escaped" "$report_cache/$dest_escaped.$$"`;
        unless ($result) {
          writelog ('No change, skipping [['.$dest.']]');
          unlink "$report_cache/$dest_escaped.$$";
          next;
        }
      }
      writelog ('Pisanje poročila na [['.$dest.']]');
      my $edit_summary = "Pisanje kazala arhivov na zahtevo: $mask - \[\[$page\]\] - urejanje bota";
      send_report($dest,$report,$edit_summary);
      rename "$report_cache/$dest_escaped.$$", "$report_cache/$dest_escaped";
    }
    
    $ul += 120 + length($log_file);
    writelog ('Končano, prenešeno {{formatnum:'.int($dl/1024).'}} kilobajtov in naloženo {{formatnum:'.int($ul/1024).'}} 
    kilobajtov (približna ocena)');
    &post_log();
    exit;
    
    #########################################
    # Subroutines                           #
    #########################################
    
    sub check_text {
      my $bytes = shift;
      my $page = shift;
    
      my $host = 'sl.wikipedia.org';
      my $path = _underscore("/w/index.php?title=$page&action=raw");
      my $sock         = new IO::Socket::INET
          (
           PeerAddr    => $host,
           PeerPort    => 80,
           Proto       => 'tcp',
          );
      return 0 unless ($sock);
    #TO_DO!
      my $header = ('GET http://'.$host.$path.' HTTP/1.1'."\r\n".'User-Agent: Mihas-bot 0.1'."\r\n\r\n");
      syswrite ($sock, $header);
      my($buf , $content, $done);
      while (!$done)
        {
        ($done = 1) unless sysread($sock, $buf, $bytes);
        $content .= $buf;
        if ((length($content) >= $bytes) || ($content =~ m|!-- Mihas-bot lahko povozi to stran --|))
          {
          $done = 1;
          }
        }
      close($sock);
      $dl += length($content);
      return ($content =~ m|!-- Mihas-bot lahko povozi to stran --|);
    }
    
    sub create_report {
      my ($ra_job) = @_;
      my ($rh_index, $numbered_links) = index_headings($ra_job);
      my $template = get_template($ra_job->{'template'});
      my $report = sprintf("%sTo kazalo je bilo avtomatsko genreirano na zahtevo na strani [[%s]]. Pokriva naslednje arhive: '''%s'''\n<br/>Poročilo je generiral [[Uporabnik:Smihael|Miha]] ob 17:14, 10. januar 2009 (CET)\n----\n\n",
                           $template->{'lead'}, $ra_job->{'page'}, join(', ', @{$ra_job->{'mask'}}));
      $report .= $template->{'header'};
      my $i = 0;
      foreach my $key (sort {lc($a) cmp lc($b) || $rh_index->{$a}->{'root_path'} cmp $rh_index->{$b}->{'root_path'}} 
    (keys(%{$rh_index}))) {
        $rh_index->{$key}->{'topic'} =~ s:({{.*?}}|[|!]{2}):<$nowiki>$1</$nowiki>:g;
        my $row = $template->{'row'};
        if ($template->{'altrow'}) {
          unless ($i++ % 2 == 0) {
            $row = $template->{'altrow'}
          }
        }
        foreach ('topic','replies','link','first','last','duration',
                 'firstepoch','lastepoch','durationsecs') {
          $row =~ s:%%$_%%:${$rh_index}{$key}{$_}:gi;
        }
        $report .= $row;
      }
      $report .= sprintf("%s\n%s", $template->{'footer'}, $template->{'tail'});
      return $report;
    }
    
    sub download_pages {
      my (@pages) = @_;
      return unless @pages;
      my $requests = scalar(@pages);
      
      my (@received_names);
      
      while (@pages) {
        my @batch;
        while ((scalar(@batch) < 50) && @pages) {
          my $item = shift(@pages) || last;
          $item = _underscore($item);
          push (@batch, $item);
        }
        $pages_attempted += scalar(@batch);
        my $xml_code = $c->special_export(@batch);
        $dl += length($xml_code);
        my $xml_result = XMLin($xml_code);
        next unless ($xml_result->{'page'});
        if ($xml_result->{'page'}{'title'}) {
          push (@received_names, handle_chunk($xml_result->{'page'}));
        } else {
          foreach my $key (keys %{$xml_result->{'page'}}) {
            push (@received_names, handle_chunk($xml_result->{'page'}->{$key}));
          }
        }
      }
      writelog('Prenešene '.scalar(@received_names)." strani na zahtevo $requests");
      return (@received_names);
    }
    
    sub fetch_pages {
      my (@jobs) = @_;
      
      my (@cache_names) = keys(%$rh_cache_data);
      foreach my $ra_job (@jobs) {
        my @fetch;
        
        if ($ra_job->{'indexhere'}) {
          my $page = _underscore($ra_job->{'page'});
          push(@fetch, $ra_job->{'page'}) unless (grep(/^\Q$page\E$/, @cache_names));
        }
        
        my $fetch_size = 0;
        foreach my $mask (@{$ra_job->{'mask'}}) {
          if ($mask =~ m|<#>|) {
            $fetch_size += 10;
            my $pattern = _underscore($mask);
            my ($part1, $part2) = split(m|<#>|, $pattern, 2);
            $pattern = qr/\Q$part1\E(\d+)/;
            $pattern .= qr/\Q$part2\E/ if $part2;
            my $leading_zeros = $ra_job->{'leading_zeros'}+1;
            my $marker = '%d';
            $marker = '%0'.$leading_zeros.'d' if ($leading_zeros > 1);
            my $printf_pattern = $mask;
            $printf_pattern =~ s|<#>|$marker|;
            my(@mask_pages) = grep(/$pattern/,@cache_names);
            my $largest = 0;
            foreach my $key (@mask_pages) {
              ($key =~ m|$pattern|) || next;
              $largest = $1 if ($1 > $largest);
            }
            my $count = $largest;
            my (@pages);
            until ($count >= ($largest + $fetch_size)) {
              $count++;
              my $page_name = sprintf($printf_pattern, $count);
              push(@fetch,$page_name);
            }
          # MONTHLY: elsif here for the <date> or whatever is used
          } else {
            my $check = _underscore($mask);
            push (@fetch, $mask) unless (grep(/^\Q$check\E$/, @cache_names));
          }
        } continue {
          if (scalar(@fetch)) {
            my (@received) = download_pages(@fetch);
            $rh_cache_data = parse_cache();
            (@cache_names) = keys(%$rh_cache_data);
            if (scalar(@fetch) == scalar(@received)) {
              @fetch = ();
              redo;
            } else {
              @fetch = ();
            }
          }
          $fetch_size = 0;
        }
      }
    }
    
    sub find_holes  # This sub will find gaps in the archive(mabye a page was deleted then restored) and
      {             # adds them to the list of potentially needed pages
      return();
      }
    
    sub find_templates {
      my (@jobs) = @_;
      my %templates;
      my @templates_needed;
      foreach my $ra_job (@jobs) {
        $templates{$ra_job->{'template'}}++;
      }
      foreach my $template (keys %templates) {
        $template = $default_template if $template eq 'default';
        my $tmpl_under = _underscore($template);
        push(@templates_needed, $template) unless grep(/^\Q$tmpl_under\E$/, keys %{$rh_cache_data});
      }
      writelog (scalar(@templates_needed).' templates needed');
      return @templates_needed;
    }
    
    sub find_updated # Find items that have changed
      {
      my(@need_update);
      foreach my $page (keys(%{$rh_cache_data})) {
        if ($rh_modified_data->{$page}) { # If it's not on the watchlist, it hasn't
                                          # been modified in the past month, ignore
          if ($rh_cache_data->{$page} < ${$rh_modified_data}{$page}) {
            push(@need_update,$page);
            my $fname = ("$wiki_cache/".uri_escape($page).' '.$rh_cache_data->{$page});
            unlink($fname); # Remove old item
          }
        }
      }
      writelog (scalar(@need_update).' pages need updating');
      return @need_update;
      }
    
    sub gather_jobs
      {
      my (@jobs);
      my $html_list         =  $c->{ua}->get($c->{index}."?title=Special:Whatlinkshere/User:Mihas-bot/OptIn&limit=5000")->content();
      $dl += length($html_list);
      my @targets;
      while ($html_list =~ s|>([^<]*?)</a> \(transclusion\)||)
        {
        push(@targets,$1);
        }
      my $xml_source = XMLin($c->special_export(@targets));
      my $xml = $xml_source;
      $dl += length($xml_source);
      my $rh_pages = ${$xml}{'page'};
      my %targets;
      foreach my $key (keys(%{$rh_pages})) {
        my $content = ${$rh_pages}{$key}{'revision'}{'text'}{'content'};
        if ($content =~ m"\Q{{User:Mihas-bot/OptIn\E\s*\|(.+?)\s*\Q}}\E"s) {
          my @params = split(/\s*\|\s*/, $1);
          my %job = ( page => $rh_pages->{$key}{'title'}, leading_zeros => 0 );
          foreach my $param (@params) {
            my ($key, $value) = split(/\s*=\s*/, $param);
            next unless ($key && defined($value));
            
            $value =~ s:^\.?/:$job{'page'}/:;
            
            if ($key eq 'target') {
              $job{'target'} = $value;
            } elsif ($key eq 'mask') {
              next unless $value;
              push (@{$job{'mask'}}, $value);
            } elsif ($key =~ /leading_zeroe?s/) {
              if ($value =~ m/^(\d+)$/) {
                $job{'leading_zeros'} = $1;
              }
            } elsif ($key eq 'indexhere') {
              $job{'indexhere'} = (($value =~ m|ye?s?|i) ? ('1') : ('0'));
            } elsif ($key eq 'template') {
              $job{'template'} = $value;
            }
            
          }
          $job{'template'} = 'default' unless $job{'template'};
          $job{'template'} = 'default' if $job{'template'} eq 'template location';
          
          next unless ($job{'target'} && $job{'mask'});
          
          if ($targets{$job{'target'}}) {
            writelog("Na zahtevo $job{'page'} podvajam cilj $job{'target'}; preklic");
            next;
          } else {
            $targets{$job{'target'}}++;
          }
          
          push(@jobs,\%job);
        }
      }
      return @jobs;
      }
    
    sub get_template {
      my ($template) = (@_);
      
      if ($template eq 'default') {
        $template = $default_template;
      }
      
      my $tmpl_fn = _escape($template);
      my ($file) = glob("$wiki_cache/$tmpl_fn*");
      unless ($file) {
        if ($template eq $default_template) {
          die "$template missing from cache\n";
        } else {
          return get_template('default');
        }
      }
      open(TMPL, $file);
      my @content = <TMPL>;
      close(TMPL);
    
      my %template = (lead => '', header => '', row => '', altrow => '', 
                      footer => '', tail => '');
      my $section = '';
      foreach my $line (@content) {
        chomp $line;
        if ($line =~ m:^<!--\s*(.*?)\s*-->$:) {
          $section = lc($1);
          $section =~ s/\s+//g;
          last if $section eq 'end';
        } else {
          if ($section) {
            next unless $line;
            $template{$section} .= "$line\n";
          }
        }
      }
      $template{'lead'} .= "\n" if $template{'lead'};
      
      unless ($template{'row'}) {
        die "Predlogi manjka 'row' parameter!\n" if $template eq $default_template;
        writelog("Pokvarjena predloga: '$template', uporabljam privzeto predlogo");
        return get_template('default');
      }
      
      return \%template;
    }
    
    sub handle_chunk {
      my $chunk = shift;
      my $name = _underscore(${$chunk}{'title'});
      my $fname = "$wiki_cache/".uri_escape($name);
      ${$chunk}{'revision'}{'timestamp'} =~ m|(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z|;
      my $time = timegm($6,$5,$4,$3,$2-1,$1);
      watch($name) unless (${$rh_cache_data}{$name});
      open(OUT,">$fname $time");
      binmode(OUT);
      use bytes;
      print OUT (${$chunk}{'revision'}{'text'}{'content'});
      no bytes;
      close(OUT);
      $pages_downloaded++;
      return $name;
    }
        
    sub index_headings {
      my ($ra_job) = @_;
      
      my $mask_re = '';
      foreach my $mask (@{$ra_job->{'mask'}}) {
        my $mask2 = _escape($mask);
        if ($mask2 =~ m|%3C%23%3E|) {
          my ($part1, $part2) = split(m|%3C%23%3E|, $mask2, 2);
          $mask_re .= '(?:';
          $mask_re .= qr/\Q$part1\E\d+/;
          $mask_re .= qr/\Q$part2\E/ if $part2;
          $mask_re .= ')|';
        # MONTHLY: elsif here for <date>
        } else {
          $mask_re .= qr/\Q$mask2\E/.'|';
        }
      }
      chop($mask_re);
      
      opendir(CACHE,$wiki_cache);
      my(@cache) = readdir(CACHE);
      closedir(CACHE);
      my @files = grep(m|^(?:$mask_re)|,@cache);
      if ($ra_job->{'indexhere'}) {
        my $page = _escape($ra_job->{'page'});
        push(@files, grep(m|^\Q$page\E \d+$|,@cache));
      }
      my (%index, %used_headings);
      my $numbered_links = 0;
      foreach my $file (@files) {
        my (%used_names);
        next unless ($file =~ m|^(.*) (\d+)$|);
        my $root_path = uri_unescape($1);
        my $display_name = $root_path;
        $display_name =~ s/_/ /g;
        open(WIKI, "$wiki_cache/$file");
        my @content = <WIKI>;
        close(WIKI);
        my $prev_heading = '';
        my ($comment_count,$first,$last) = (0,0,0);
        foreach my $line (@content) {
          if ($line =~ m|^==\s*([^=].+?)\s*==|) {
            if ($prev_heading && $comment_count > 0) {
              ## WARNING: This code is duplicated below vvvvvv
              $index{$prev_heading}->{'replies'} = $comment_count;
              if ($first && $last) {
                $index{$prev_heading}->{'firstepoch'} = $first;
                $index{$prev_heading}->{'first'} = strftime('%F %T',gmtime($first));
                $index{$prev_heading}->{'lastepoch'} = $last;
                $index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last));
                $index{$prev_heading}->{'durationsecs'} = $last - $first;
                if ($comment_count > 1) {
                  $index{$prev_heading}->{'duration'} = duration($last - $first);
                } else {
                  $index{$prev_heading}->{'duration'} = 'Brez';
                }
              }
              $comment_count = 0;
              $first = 0;
              $last = 0;
            }
            my $heading = $1;
            my $head_link;
            ($head_link, $numbered_links) = path_fix($heading, $numbered_links);
            $used_names{$head_link}++;
            my $suffix = (($used_names{$head_link} > 1) ? ('_'.$used_names{$head_link}) : (''));
            $used_headings{$head_link.$suffix}++;
            $prev_heading = $head_link.$suffix.'_'.$used_headings{$head_link.$suffix};
            $index{$prev_heading} = { topic => $heading, link => 
    ("[[{{urlencode:$root_path}}#$head_link".$suffix."|$display_name]]"),
                                      root_path => $root_path, head_link => $head_link,
                                      replies => 'Neznano', first => 'Neznano',
                                      'last' => 'Neznano', duration => 'Neznano',
                                      firstepoch => 0, lastepoch => 0,
                                      durationsecs => 0,
                                    };
          } elsif ($line =~ m/\[\[User.*[\]>)}].*?\s+(.*\(UTC\))/) {
            $comment_count++;
            my $time = str2time($1);
            if ($time && (!$first || $time < $first)) {
              $first = $time;
            }
            if ($time && ($time > $last)) {
              $last = $time;
            }
          }
        }
        if ($prev_heading && $comment_count > 0) {
          ## WARNING: This code is duplicated from above ^^^^^^
          $index{$prev_heading}->{'replies'} = $comment_count;
          if ($first && $last) {
            $index{$prev_heading}->{'firstepoch'} = $first;
            $index{$prev_heading}->{'first'} = strftime('%F %T', gmtime($first));
            $index{$prev_heading}->{'lastepoch'} = $last;
            $index{$prev_heading}->{'last'} = strftime('%F %T', gmtime($last));
            $index{$prev_heading}->{'durationsecs'} = $last - $first;
            if ($comment_count > 1) {
              $index{$prev_heading}->{'duration'} = duration($last - $first);
            } else {
              $index{$prev_heading}->{'duration'} = 'None';
            }
          }
        }
      }
      return \%index;
    }
    
    sub parse_cache
      {
      my (@pages,$count);
      opendir(CACHE,$wiki_cache);
      my(@files) = readdir(CACHE);
      closedir(CACHE);
      my(%cache);
      foreach my $file (@files)
        {
        next unless ($file =~ m|^(.*) (\d+)$|);
        my $page_name = uri_unescape($1);
        my $time = $2;
        $cache{$page_name} = $time;
        }
      return \%cache;
      }
    
    sub parse_watchlist
      {
      my $watchlist         =  $c->{ua}->get($c->{index}."?title=Special:Watchlist&days=0")->content();
      $dl += length($watchlist);
      my @lines             =  split("\n",$watchlist);
      my @date;
      my %watchlist;
      while (scalar(@lines))
        {
        my $line = shift(@lines);
        if ($line =~ m|<h4>(\d{4})-(\d{2})-(\d{2})</h4>|i)
          {
          @date = ($1,$2,$3);
          }
        if ($line =~ m|title="([^"]*?)">hist</a>|i) # "
          {
          my $page_name = _underscore($1);
          $line =~ m|(\d{2}):(\d{2}):(\d{2})|;
          $watchlist{$page_name} = timegm($3,$2,$1,$date[2],$date[1]-1,$date[0]);
          }
        }
      return \%watchlist;
      }
    
    sub path_fix {
      my ($path,$numbered_links) = @_;
      ($path =~ s|'{2,4}||g);
      ($path =~ s|<.*?>||g);
      ($path =~ s/\[\[:?.*?\|(.*?)\]\]/$1/g);
      ($path =~ s|\[\[:?(.*?)\]\]|$1|g);
      while ($path =~ m|\[.*?\]|) {
        my $title;
        if ($path =~ m|\[[^ ]* (.*?)\]|) {
          $title = $1;
        } else {
          $numbered_links++;
          $title = ".5B$numbered_links.5D";
        }
        $path =~ s|\[.*?\]|$title|;
      }
      ($path =~ s|\s|_|g);
      ($path =~ s| |.C2.A0|g);
      while ($path =~ m|([^/a-z0-9\.:_'-])|i) {
        my $bad = $1;
        my $fix = uc('.'.sprintf("%x",ord($bad)));
        ($path =~ s/\Q$bad/$fix/g);
      }
      return ($path,$numbered_links);
    }
    
    sub post_log {
      my $pg                =  $c->get('User:Mihas-bot/Indeksiranje/dnevnik', 'rw');
      $pg->{summary}        =  ('Pisanje dnevniške dtaoteke '.$start_time).' - urejanje bota';
      $pg->{content}        =  $log_file;
      $pg->save();
    }
    
    sub send_report {
      my $dest      = shift;
      my $report    = shift;
      my $edit_summary = shift;
      my $pg        = $c->get($dest, 'w');
      $pg->{summary}        =  $edit_summary;
      $pg->{content}        =  '<!-- Mihas-bot lahko povozi to stran -->'."\n".$report;
      $ul += length($report);
      $pg->save() || writelog("Shranjevanje poročila na $dest ni uspelo");
      sleep($write_delay);
    }
    
    sub watch
      {
      my $page_name = shift;
      my $success = $c->{ua}->get($c->{index}."?title=$page_name&action=watch")->is_success;
      $pages_watched++ if ($success);
      return $success;
      }
    
    sub writelog {
      my $entry = shift;
      my @month_table =
      (
       'januar',
       'februar',
       'marec',
       'april',
       'maj',
       'junij',
       'julij',
       'avgust',
       'september',
       'oktober',
       'november',
       'december',
      );
      my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime(time);
      my $time = sprintf("%02d:%02d:%02d %02d %s %04d", $hour,$min,$sec,$mday,$month_table[$mon],($year+1900));
      $start_time ||= $time;
      $log_file .= ('* '.$time.': '.$entry.' [[Uporabnik:Smihael|Miha]]'."\n");
      warn "$entry\n";
    }
    
    sub _escape {
      my ($val) = @_;
      $val = _underscore($val);
      $val = uri_escape($val);
      return $val;
    }
    
    sub _hash {
      my ($val) = @_;
      $val = _escape($val);
      $val = sha1_hex($val);
      return $val;
    }
    
    sub _underscore {
      my ($val) = @_;
      $val =~ s|\s|_|g;
      return $val;
    }