<?php
// Main driver: stream STDIN line by line, remember the most recent <title>,
// and hand each page's <text> content to process_page().
//
// Both state variables are initialised up front so the first iterations do
// not read undefined variables (a warning on PHP 8 that would otherwise be
// mixed into the report).
$title = null;   // current page title with spaces replaced by underscores
$buffer = null;  // accumulated text of a multi-line <text> element, or null when not inside one

// Compare against false explicitly: fgets() returns false only at EOF/error,
// whereas a final line of "0" without a newline is falsy but still data.
while (($line = fgets(STDIN)) !== false) {
    if (preg_match('/^\s*<title>(.*)<\/title>\s*$/', $line, $matches)) {
        $title = strtr($matches[1], ' ', '_');
    } elseif ($buffer !== null) {
        // Inside a multi-line <text> element: look for the closing tag.
        if (preg_match('/(.*)<\/text>\s*$/', $line, $matches)) {
            process_page($title, $buffer . ' ' . $matches[1]);
            $buffer = null;
        } else {
            // Lines are joined with single spaces so the body never contains newlines.
            $buffer .= ' ' . rtrim($line);
        }
    } elseif (preg_match('/^\s*<text[^>]*>(.*)<\/text>\s*$/', $line, $matches)) {
        // Whole <text> element on a single line.
        process_page($title, rtrim($matches[1]));
    } elseif (preg_match('/^\s*<text[^>]*>(.*)/', $line, $matches)) {
        // Opening tag only: start buffering. The leading space keeps the
        // buffer non-empty even when nothing follows the tag on this line.
        $buffer = ' ' . rtrim($matches[1]);
    }
}
/**
 * Print one report line for a page.
 *
 * The line starts with the title, followed by up to three tallies produced by
 * tally_dates() and rendered by pretty_print(): dates found inside {{...}}
 * templates, dates found inside <ref>...</ref> elements, and dates found in
 * whatever text remains. The line is terminated with "\n".
 *
 * @param string|null $title Page title, printed verbatim.
 * @param string      $body  Page text; matched without the /s modifier, so it
 *                           is expected to contain no newlines.
 * @return void
 */
function process_page($title, $body) {
    print $title;

    // Pass 1: templates. [^\{\}]* only matches a template with no braces
    // inside it, so nested templates are peeled off innermost-first; each one
    // is replaced by a space and its content tallied.
    // $dates is initialised explicitly: passing an undefined variable raises a
    // warning on PHP 8.
    $dates = null;
    while (preg_match('/(.*)\{\{([^\{\}]*)\}\}(.*)/', $body, $matches)) {
        $body = $matches[1] . ' ' . $matches[3];
        $dates = tally_dates($matches[2], $dates);
    }
    pretty_print($dates, 'template');

    // Pass 2: references, handled the same way.
    // NOTE(review): this matches literal <ref ...> tags; if the input still
    // carries XML entities (&lt;ref&gt;) nothing will match here - confirm
    // against the real input.
    $dates = null;
    while (preg_match('/(.*)\<ref[^\&]*\>(.*?)\<\/ref[^\&]*\>(.*)/i', $body, $matches)) {
        $body = $matches[1] . ' ' . $matches[3];
        $dates = tally_dates($matches[2], $dates);
    }
    pretty_print($dates, 'references');

    // Pass 3: everything left over after templates and references were removed.
    $dates = tally_dates($body, null);
    pretty_print($dates);

    print "\n";
}
/**
 * Count the date formats that occur in a piece of text.
 *
 * Each pattern is applied repeatedly; every hit increments the counter for
 * that format and is cut out of the text (replaced by one space) so the same
 * characters are not counted again by a later, looser pattern. The order of
 * the pattern table therefore matters: linked and more specific formats are
 * consumed before raw and shorter ones.
 *
 * @param string     $string Text to scan.
 * @param array|null $dates  Running tallies keyed by format name, or null to
 *                           start fresh.
 * @return array|null The updated tallies. When nothing matched, $dates is
 *                    returned unchanged (so null stays null).
 */
function tally_dates($string, $dates) {
    $month = '(january|february|march|april|may|june|july|august|september|october|november|december)';
    // Every pattern ends with a catch-all group holding the text after the date.
    $tail = '(.*)/iu';

    // Building blocks for [[wiki-linked]] date fragments.
    $linkDM = "\[\[(\d{1,2})[ _]{$month}]]";
    $linkMD = "\[\[{$month}[ _](\d{1,2})]]";
    $linkY = "\[\[(\d{1,4}([ _]BC|))]]";
    $linkISO1 = "\[\[(-?\d{4})]]-\[\[(\d{2})-(\d{2})]]";
    $linkISO2 = "\[\[(-?\d{4})-(\d{2})-(\d{2})]]";

    // Format name => pattern, in tally order.
    // NOTE(review): there is no YMD_raw entry ("2007, January 15"); that
    // format has never been tallied by this function, and adding it would
    // introduce a new key in the report - confirm whether it should be counted.
    $patterns = array(
        'DMY_linked' => "/(.*){$linkDM} *,? *{$linkY}{$tail}",
        'MDY_linked' => "/(.*){$linkMD} *,? *{$linkY}{$tail}",
        'YDM_linked' => "/(.*){$linkY} *,? *{$linkDM}{$tail}",
        'YMD_linked' => "/(.*){$linkY} *,? *{$linkMD}{$tail}",
        'MD_linked' => "/(.*){$linkMD}{$tail}",
        'DM_linked' => "/(.*){$linkDM}{$tail}",
        'DMY_raw' => "/(.*)(\d{1,2})[ _]{$month} *,? *(\d{1,4}([ _]BC|)){$tail}",
        'MDY_raw' => "/(.*){$month} +(\d{1,2}) *,? +(\d{1,4}([ _]BC|)){$tail}",
        'YDM_raw' => "/(.*)(\d{1,4}([ _]BC|)) *,? +(\d{1,2})[ _]{$month}{$tail}",
        'ISO1_linked' => "/(.*){$linkISO1}{$tail}",
        'ISO2_linked' => "/(.*){$linkISO2}{$tail}",
        'ISO_raw' => "/(.*)(-?\d{4})-(\d{2})-(\d{2}){$tail}",
    );

    foreach ($patterns as $format => $regex) {
        // The leading (.*) is greedy, so each iteration removes the last
        // remaining occurrence; the loop ends when none is left (or when
        // preg_match fails, e.g. on invalid UTF-8 under the /u modifier).
        while (preg_match($regex, $string, $matches)) {
            // Avoid incrementing an undefined key (a warning on PHP 8).
            $dates[$format] = (isset($dates[$format]) ? $dates[$format] : 0) + 1;

            // The trailing text is always the LAST capture group. Its index
            // differs per pattern (4, 5 or 6), so it is derived rather than
            // hard-coded: a fixed index of 6 on the five-group ISO link
            // patterns would silently discard everything after the match.
            $string = $matches[1] . ' ' . $matches[count($matches) - 1];
        }
    }

    return $dates;
}
/**
 * Print a tally as " format:count,format:count", optionally wrapped.
 *
 * Output always starts with a single space. With $type 'template' the list
 * is wrapped in {...}, with 'references' in <...>; any other value prints
 * the bare list. Nothing at all is printed when $dates is not an array.
 *
 * @param array|null  $dates Tallies keyed by format name.
 * @param string|null $type  'template', 'references', or null for no wrapper.
 * @return void
 */
function pretty_print($dates, $type = null) {
    if (!is_array($dates)) {
        return;
    }

    // Strict comparison so that only the exact strings select a wrapper.
    if ($type === 'template') {
        $open = ' {';
        $close = '}';
    } elseif ($type === 'references') {
        $open = ' <';
        $close = '>';
    } else {
        $open = ' ';
        $close = '';
    }

    // Build the entries first and join them, instead of printing a separator
    // variable that is still undefined on the first iteration (a warning on
    // PHP 8 that would end up inside the report).
    $entries = array();
    foreach ($dates as $format => $count) {
        $entries[] = $format . ':' . $count;
    }

    print $open . implode(',', $entries) . $close;
}
?>