#!/usr/bin/perl -w
##########################
# Transformation of person data
##########################
# Parse the date entries and expand every date field into 8 fields
# Do the same for the place articles and the given names/surnames
##########################
# Month names as they may appear in a cleaned-up date entry.
# The lower-case (Slovenian) names are the ones written out by clean_date
# when it converts a purely numeric date; the English names are the ones
# clean_date produces when it expands abbreviations such as "Jan." or "Sept.".
# Both sets must be recognised, otherwise parse_date can never match a date
# whose month was expanded from an abbreviation.
# NOTE(review): confirm which language the input data actually uses.
my @native_months = qw(
    januar februar marec april maj junij
    julij avgust september oktober november december
);
my @english_months = qw(
    January February March April May June
    July August September October November December
);

# %months:    month name   => month number (1..12), both spellings
# %monthname: month number => name used when re-writing numeric dates
my %months    = ();
my %monthname = ();
foreach my $number ( 1 .. 12 ) {
    my $native = $native_months[ $number - 1 ];
    $monthname{$number}                        = $native;
    $months{$native}                           = $number;
    $months{ $english_months[ $number - 1 ] } = $number;
}

# Regex fragments shared by the date patterns in parse_date.
# The names are sorted so the generated alternation is the same on every run.
my $MONTH = "(" . join("|", sort keys %months) . ")";
my $YEAR = "(\\d{1,4})";
my $PREFIX = "([^0-9]+)";
# Clean up a raw date entry so that it follows the standardised persondata
# date format expected by parse_date.
#
# Argument: the raw field text (may contain wiki markup, templates,
#           HTML comments, abbreviations, question marks ...).
# Returns:  the normalised string; an undefined argument yields "".
#
# The substitutions are order dependent - later rules rely on the spacing
# and capitalisation produced by earlier ones.
sub clean_date {
    my $d = shift;
    return "" unless defined $d;

    # Put spaces between wikilinks
    $d =~ s/([\]])([\[])/$1 $2/g;
    # Deal with piped links: keep only the label after the last pipe.
    # The label must not run past the closing brackets, otherwise two
    # links in one entry would be merged into a single "label".
    $d =~ s/\[\[[^\]]*\|([^\]]*)\]\]/$1/g;
    # Remove square brackets
    $d =~ s/[\[\]]//g;
    # Deal with templates (braces escaped: a bare "{" in a pattern is
    # rejected or warned about by newer perls)
    $d =~ s/\{\{([Bb]irth|[Dd]eath) date( and age)?\s?\|\s?(\d{1,4})\|\s?(\d{1,2})\|\s?(\d{1,2})(.*)/$3-$4-$5/g;

    # DD.MM.YYYY => DD MMM YYYY (only when the month number is known)
    if ($d =~ /^0?(\d{1,2})\.(\d{1,2})\.(\d{1,4})$/) {
        my ( $day, $month, $year ) = ( $1, int $2, $3 );
        $d = "$day ".$monthname{$month}." $year" if defined $monthname{$month};
    }
    # YYYY-MM-DD => DD MMM YYYY (only when the month number is known)
    if ($d =~ /^(\d{1,4})-(\d{1,2})-0?(\d{1,2})$/) {
        my ( $year, $month, $day ) = ( $1, int $2, $3 );
        $d = "$day ".$monthname{$month}." $year" if defined $monthname{$month};
    }

    # AD/CE is implicit
    $d =~ s/A\. ?D\.//; # TODO: doesn't completely work?? e.g. Libanios
    $d =~ s/A ?D//;
    $d =~ s/\bC\. ?E\.//;
    $d =~ s/\bC ?E//;
    # Remove trailing punctuation
    $d =~ s/(,|\||;|=)$//;
    # Remove HTML comments. The pattern must not be empty: an empty
    # pattern makes perl re-use the last successfully matched regex.
    $d =~ s/<!--.*?-->//gs;
    # Add forgotten spaces
    $d =~ s/([a-z])(\d)/$1 $2/g;
    $d =~ s/(\d)([a-z])/$1 $2/g;
    $d =~ s/\.([^ ])/. $1/g;
    # Remove spaces from ordinals
    $d =~ s/(\d) (th|st|nd|rd|s\b)/$1$2/g;
    # Remove double spaces
    $d =~ s/\s+/ /g;
    # Remove spaces at beginning and end
    $d =~ s/^\s+//;
    $d =~ s/\s+$//;
    # Fields with only question marks => "unknown"
    $d =~ s/^\?$/unknown/;
    # Uniform capitalisation
    $d =~ s/Unknown/unknown/g;
    $d =~ s/After/after/g;
    # Question marks should always have brackets and a space before them
    $d =~ s/\(\?\)/\?/g;
    $d =~ s/([^ ])\?+/$1 \?/g;
    $d =~ s/\?/(\?)/g;
    # Instead of a question mark at the end, 'probably' at the beginning
    $d =~ s/^(.+) \(\?\)$/probably $1/;
    $d =~ s/^\(\?\)$/probably/;
    # Slash without space
    $d =~ s/ ?\/ ?/\//g;
    # Remove bolding/italics
    $d =~ s/\'{2,5}//g;
    # Shortened month names
    $d =~ s/Jan[\.\s]+/January /;
    $d =~ s/Feb[\.\s]+/February /;
    $d =~ s/Mar[\.\s]+/March /;
    $d =~ s/Apr[\.\s]+/April /;
    $d =~ s/Jun[\.\s]+/June /;
    $d =~ s/Jul[\.\s]+/July /;
    $d =~ s/Aug[\.\s]+/August /;
    $d =~ s/Sep[\.\s]+/September /;
    $d =~ s/Sept[\.\s]+/September /;
    $d =~ s/Oct[\.\s]+/October /;
    $d =~ s/Nov[\.\s]+/November /;
    $d =~ s/Dec[\.\s]+/December /;
    # Write out "Century"
    $d =~ s/C\./century/g;
    # "End of the 5th century" => "End 5th century" (simpler)
    $d =~ s/ of the//;
    $d =~ s/(approx\.?|[Cc]irca\.?\s?|\bca?[\.\s]+|about|around|~)/circa /;
    $d =~ s/([Pp]ossibly|[Pp]robably)/probably/g;
    $d =~ s/([Ss]till )?[Ll]iving( [Pp]erson)?//;
    # "N/A" / "NA" only as a word of its own, so that the letters "na"
    # inside an ordinary word are left alone
    $d =~ s/\b[Nn]\/?[Aa]\b//;
    $d =~ s/[Nn]ot [Aa]pplicable//;
    # Remove double spaces
    $d =~ s/\s+/ /g;
    #----------Error fixing---------
    # Other typing errors
    #$d =~ s/^Um/um/;
    #$d =~ s/chr/Chr/;
    return $d;
}
# Integer base-10 logarithm of a non-negative integer, i.e. the number of
# decimal digits minus one (ilog10(7) == 0, ilog10(33) == 1, ilog10(1632) == 3).
# Counting digits avoids the floating point error of log($x)/log(10)
# (which gives 2.999... for 1000) and the fatal error of log(0);
# ilog10(0) is defined as 0. The sign of the argument is ignored.
sub ilog10 {
    my $x = shift;
    return length( int( abs($x) ) ) - 1;
}
# Names of the fields every parsed date consists of.
my @fields = ( "day", "month", "year", "century", "decade", "year1", "year2", "note" );

# Parse a date string (as produced by clean_date) into its components.
#
# Argument: the cleaned date string; empty/undefined gives all-empty fields.
# Returns:  a hash (as a list) with one entry for each name in @fields.
#           Fields that could not be determined are "". Years, decades and
#           centuries before the common era are negative. Input that matches
#           no known pattern is returned unchanged in "note".
sub parse_date {
    my $date = shift;
    my %d = ();

    if ( !$date ) {
        $d{"note"} = "";
    # Normal date entry: [prefix] [day] month year [BC]
    } elsif ( $date =~ /^$PREFIX?(\d+ )?$MONTH,? $YEAR( BCE?\.?)?$/ ) {
        my ( $note, $day, $month, $year, $bc ) = ( $1, $2, $3, $4, $5 );
        $d{"note"}  = trim($note) if defined $note;
        $d{"day"}   = substr( $day, 0, -1 ) if defined $day;    # drop trailing space
        $d{"month"} = $months{$month} if defined $month;
        $d{"year"}  = int $year;
        $d{"year"}  = -$d{"year"} if defined $bc;               # B.C.
    # [prefix] month [day], year [BC]
    } elsif ( $date =~ /^$PREFIX?$MONTH (\d+)?,?\s?$YEAR( BCE?\.?)?$/ ) {
        my ( $note, $month, $day, $year, $bc ) = ( $1, $2, $3, $4, $5 );
        $d{"note"}  = trim($note) if defined $note;
        $d{"day"}   = $day if defined $day;
        $d{"month"} = $months{$month} if defined $month;
        $d{"year"}  = int $year;
        $d{"year"}  = -$d{"year"} if defined $bc;               # B.C.
    # [prefix] year [BC]
    } elsif ( $date =~ /^$PREFIX?$YEAR( BCE?\.?)?$/ ) {
        my ( $note, $year, $bc ) = ( $1, $2, $3 );
        $d{"note"} = trim($note) if defined $note;
        $d{"year"} = int $year;
        $d{"year"} = -$d{"year"} if defined $bc;                # B.C.
    # Century
    } elsif ( $date =~ /^$PREFIX?(\d{1,2})(\.|th|st|nd|rd) [Cc]entury( BCE?\.?)?$/ ) {
        my ( $note, $century, $bc ) = ( $1, $2, $4 );
        $d{"note"}    = trim($note) if defined $note;
        $d{"century"} = $century;
        $d{"century"} = -$d{"century"} if defined $bc;
    # Decade
    } elsif ( $date =~ /^(\d{1,4}) ?'?s( BCE?\.?)?$/ ) {
        # the BC marker is the SECOND capture group of this pattern
        my ( $decade, $bc ) = ( $1, $2 );
        $d{"decade"} = int( $decade / 10 ) * 10;
        $d{"decade"} = -$d{"decade"} if defined $bc;
        # Only survives for decade 0; any other decade gets its century
        # recalculated further down.
        $d{"century"} = defined $bc ? -1 : 1;
    } elsif ( $date =~ /^$PREFIX?$YEAR or $YEAR$/ ) { # TODO: or != until, TODO: B.C.
        my ( $note, $first, $second ) = ( $1, $2, $3 );
        $d{"note"}  = trim($note) if defined $note;
        $d{"year1"} = int $first;
        $d{"year2"} = int $second;
    } elsif ( $date =~ /^$PREFIX?$YEAR\/(\d{1,4})$/ ) { # TODO: B.C.
        my ( $note, $first, $second ) = ( $1, int($2), int($3) );
        $d{"note"} = trim($note) if defined $note;
        # Example: 1632/33 => 1632/1633
        # Number of leading digits of the first year missing from the second.
        # Digit counts are used instead of logarithms: exact, and safe for 0.
        my $missing = length($first) - length($second);
        if ( $missing < 0 ) {
            # Second year has more digits than the first: not an abbreviation.
            # Leave year1/year2 unset so no century/decade is derived from them.
            $d{"note"} = $date;
        } else {
            $d{"year1"} = $first;
            $d{"year2"} = substr( $first, 0, $missing ) . $second;
        }
    # Interval of several years # TODO: not tested!
    } elsif ( $date =~ /^$PREFIX?between $YEAR( and )$YEAR( BCE?\.?)?$/ or
              $date =~ /^$PREFIX?$YEAR( to |-)$YEAR( BCE?\.?)?$/ )
    {
        # Both patterns use the same group numbering.
        my ( $note, $first, $second, $bc ) = ( $1, $2, $4, $5 );
        $d{"note"}  = trim($note) if defined $note;
        $d{"year1"} = int $first;
        $d{"year2"} = int $second;
        if ( defined $bc ) {
            # NOTE(review): the years are negated AND swapped here, as in the
            # original code - confirm that this is the intended ordering.
            ( $d{"year1"}, $d{"year2"} ) = ( -$d{"year2"}, -$d{"year1"} );
        }
    } else {
        $d{"note"} = $date;
    }

    if ( defined $d{"year1"} and defined $d{"year2"} ) {
        # if century can be defined
        if ( int( $d{"year1"} / 100 ) == int( $d{"year2"} / 100 ) ) {
            $d{"century"} = int( $d{"year1"} / 100 ) + 1;
        }
        # if decade can be defined
        if ( $d{"century"} and int( $d{"year1"} / 10 ) == int( $d{"year2"} / 10 ) ) {
            $d{"decade"} = int( $d{"year1"} / 10 ) * 10;
        }
    }

    # calculate decade
    if ( defined $d{"year"} ) {
        $d{"decade"} = int( $d{"year"} / 10 ) * 10;
    }
    # calculate century
    if ( defined $d{"decade"} ) {
        if ( $d{"decade"} != 0 ) {
            # hundreds of the decade, moved one step away from zero
            $d{"century"} = int( $d{"decade"} / 100 ) + ( $d{"decade"} > 0 ? 1 : -1 );
        } elsif ( defined $d{"year"} and $d{"year"} != 0 ) {
            # Years 1..9 and -9..-1: the sign of the year decides.
            # A year of 0 is skipped - it would divide by zero.
            $d{"century"} = $d{"year"} > 0 ? 1 : -1;
        }
    }

    # Initialise undefined fields
    foreach my $f (@fields) {
        $d{$f} = "" if not defined $d{$f};
    }
    return %d;
}
# Extract the article name of the place from a location field.
#
# Only fields that START with a wikilink are understood:
#   [[Place]] ...          => "Place"
#   [[Place|label]] ...    => "Place"
# Everything else (plain text, "a or b", ".../ ...", empty or undefined
# input) yields "".
sub parse_location {
    my $location = trim(shift);
    return "" unless defined $location;

    if ( $location =~ /^\[\[([^\]]+)\]\].*?$/ ) {
        my $target = $1;
        # piped link: keep the link target, drop the label
        $target =~ s/\|.*$//;
        return $target;
    }
    return "";
}
# For PND number (assigned to German-speaking authors).
# Small number of articles on en wiki have this
#
# Argument: the raw PND field.
# Returns:  the list (pnd_nr, pnd_date)
#   pnd_nr   - the 9 character number if the field starts with one whose
#              check digit is valid and whose first two digits are 10..14,
#              otherwise ""
#   pnd_date - "YYYY-M-D" built from the first D.M.YYYY date found in the
#              field (digits are copied as written), otherwise ""
sub parse_pnd {
    my $entry = trim(shift);
    $entry = "" unless defined $entry;

    my $pnd_nr   = "";
    my $pnd_date = "";

    if ( $entry =~ /^([0-9])([0-9])([0-9])([0-9])([0-9])([0-9])([0-9])([0-9])([0-9X])/ ) {
        my @digit      = ( $1, $2, $3, $4, $5, $6, $7, $8 );
        my $check_char = $9;
        my $check      = $check_char eq "X" ? 10 : $check_char;

        # check digit: digits weighted 2..9, sum taken modulo 11
        my $sum = 0;
        $sum += ( $_ + 2 ) * $digit[$_] for 0 .. 7;

        # TODO: Number range 10000000 to 14999999
        if ( $sum % 11 == $check and $digit[0] == 1 and $digit[1] <= 4 ) {
            $pnd_nr = join( "", @digit, $check_char );
        }
    }

    # The quantifier belongs inside the group, otherwise only the last
    # digit of a two-digit day is captured.
    if ( $entry =~ /(\d{1,2})\.(\d{1,2})\.(\d\d\d\d)/ ) {
        $pnd_date = "$3-$2-$1";
    }
    return ( $pnd_nr, $pnd_date );
}
# Strip wikilink brackets from a string.
# Adjacent links ("]][[") are separated by a space first so that their
# texts do not run together. An undefined argument is returned unchanged.
sub unbracket {
    my $text = shift;
    return $text unless defined $text;

    # Insert missing spaces between wikilinks
    $text =~ s/\](?=\[)/] /g;
    # Remove square brackets
    $text =~ tr/\[\]//d;
    return $text;
}
# Remove whitespace at the beginning and end of a string.
# False values (undef, "" and "0") are returned exactly as they were passed.
sub trim {
    my $text = shift;
    return $text unless $text;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}
##########################
# Main loop: read tab separated persondata records (from the files named on
# the command line or from standard input) and print one tab separated line
# of 33 columns for each of them: the 10 input columns (trimmed, dates
# without brackets) followed by the derived PND, name, place and date fields.
while ( my $line = <> ) {
    # chomp only removes a line ending; chop would eat a real character
    # when the last line of the input has no newline
    chomp $line;

    # limit -1 keeps trailing empty columns
    my @pd = split( /\t/, $line, -1 );
    # short lines: treat missing columns as empty
    push @pd, "" while @pd < 10;

    my @pd_transformed = ();

    # 0 pd_id, 1 pd_article, 2 pd_name, 3 pd_alternative, 4 pd_description,
    # 5 pd_born, 6 pd_born_in, 7 pd_died, 8 pd_died_in, 9 pd_pnd
    $pd_transformed[$_] = trim( $pd[$_] ) for 0 .. 9;
    $pd_transformed[5] = unbracket( $pd_transformed[5] );    # pd_born
    $pd_transformed[7] = unbracket( $pd_transformed[7] );    # pd_died

    # extract checked pnd-nr and additional date
    ( $pd_transformed[10], $pd_transformed[11] ) = parse_pnd( $pd[9] );    # pnd_nr, pnd_date

    # "Surname, Given" or "Surname, Given, Suffix"
    my $name = trim( $pd[2] );
    my ( $given, $surname, $suffix ) = ( "", "", "" );
    if ( $name =~ /^([^,]+),([^,]+)$/ ) {
        ( $surname, $given ) = ( $1, $2 );
    }
    elsif ( $name =~ /^([^,]+),([^,]+),([^,]+)$/ ) {
        ( $surname, $given, $suffix ) = ( $1, $2, $3 );
    }
    $pd_transformed[12] = trim($given);      # n_given
    $pd_transformed[13] = trim($surname);    # n_surname
    $pd_transformed[14] = trim($suffix);     # n_suffix

    $pd_transformed[15] = parse_location( $pd[6] );    # b_place
    $pd_transformed[16] = parse_location( $pd[8] );    # d_place

    # column order of the date fields in the output
    my @date_columns = ( "day", "month", "year", "decade", "century", "year1", "year2", "note" );

    # 17 b_day, 18 b_month, 19 b_year, 20 b_decade, 21 b_century,
    # 22 b_year1, 23 b_year2, 24 b_note
    my %born = parse_date( clean_date( $pd[5] ) );
    @pd_transformed[ 17 .. 24 ] = @born{@date_columns};

    # 25 d_day, 26 d_month, 27 d_year, 28 d_decade, 29 d_century,
    # 30 d_year1, 31 d_year2, 32 d_note
    my %died = parse_date( clean_date( $pd[7] ) );
    @pd_transformed[ 25 .. 32 ] = @died{@date_columns};

    print join( "\t", @pd_transformed ) . "\n";
}