#!/usr/bin/perl
use strict;
use warnings;
use utf8;
do "NGramLangModel.pl";
# Interchangeable letters, laid out as adjacent pairs: the element at an even
# index and the element right after it replace each other
# (i/y, í/ý, I/Y, Í/Ý).
our @iy = ( "i", "y", "í", "ý", "I", "Y", "Í", "Ý");
# Prints a short help text to the currently selected output handle
# (STDOUT unless the caller changed it).
#
# The synopsis lists the four positional arguments in the order in which the
# script reads them from @ARGV: training file, held-out file, n-gram order
# and the file whose i/y spelling is to be corrected.
#
# Takes no arguments; the return value is not meaningful.
sub usage
{
    print "correct.pl trainFile heldOutFile N bogusFile\n",
          "Trains an N-gram language model on trainFile and smooths it with heldOutFile.\n",
          "Picks the most probable i/y spelling for every word read from bogusFile.\n";
    return;
}
# Builds a table of spelling variants for a sequence of words.
#
#   $tm    - array ref of words
#   $posib - array ref that is filled in place: for every index of @$tm it
#            receives an array ref holding the list getPosib() returns for
#            the word at that index
#
# NOTE(review): no caller of init() is visible in this file, so the argument
# order ($tm, $posib) is an assumption - confirm before relying on it.
#
# Returns nothing.
sub init
{
    my ($tm, $posib) = @_;

    for my $idx (0 .. $#{$tm}) {
        # getPosib() returns a list; wrap it in an array ref so the whole
        # list is stored instead of just its element count.
        $posib->[$idx] = [ getPosib($tm->[$idx]) ];
    }
    return;
}
# Generates every spelling of a word that can be obtained by independently
# swapping each i/y letter with its counterpart
# (i<->y, í<->ý, I<->Y, Í<->Ý).
#
#   $word - the word to expand
#
# Returns a list of 2**k strings, where k is the number of i/y letters in
# $word. The first element is always $word itself; a word without any i/y
# letter yields a one-element list. An undefined $word yields an empty list.
# Call in list context - in scalar context only the element count is
# returned.
sub getPosib
{
    my ($word) = @_;
    return if !defined $word;

    # Grow all variants one character at a time.
    my @variants = ('');
    for my $c (split //, $word) {
        # tr/// leaves every character that is not an i/y letter untouched,
        # so "$alt eq $c" identifies the characters that need no branching.
        (my $alt = $c) =~ tr/iyíýIYÍÝ/yiýíYIÝÍ/;
        if ($alt eq $c) {
            $_ .= $c for @variants;
        }
        else {
            # First half keeps the original letter, second half takes the
            # swapped one, so index 0 stays the unchanged word.
            @variants = (
                (map { $_ . $c } @variants),
                (map { $_ . $alt } @variants),
            );
        }
    }
    return @variants;
}
# ---------------------------------------------------------------------------
# Main program
#
# Command line: trainFile heldOutFile N bogusFile
#
# Trains an N-gram model on trainFile, smooths it with heldOutFile and then
# walks through the n-grams of bogusFile, choosing for every word the i/y
# spelling that the model scores highest given the previously chosen words.
# ---------------------------------------------------------------------------

# Pushes one incoming word onto the history and settles on its spelling.
#
#   $model - object providing getSmoothedProb(\@words, $order)
#   $used  - array ref with the already chosen words, newest first;
#            modified in place
#   $word  - the incoming word
#   $order - number of words handed to the model for scoring; also the
#            maximum length the history is allowed to reach
#
# Returns the chosen spelling (also stored in $used->[0]).
sub chooseSpelling
{
    my ($model, $used, $word, $order) = @_;

    # Free slot 0 for the new word and drop the oldest word once the history
    # would grow beyond $order entries.
    unshift @{$used}, $word;
    $#{$used} = $order - 1 if @{$used} > $order;

    # A word without any i/y letter has only one possible spelling.
    return $word if $word !~ /[íýiyÍÝIY]/;

    my @candidates = getPosib($word);
    return $word if !@candidates;

    # Try each candidate in slot 0. The first candidate wins unless a later
    # one scores strictly higher.
    my $bestProb  = 0;
    my $bestIndex = 0;
    for my $idx (0 .. $#candidates) {
        $used->[0] = $candidates[$idx];
        my $prob = $model->getSmoothedProb($used, $order);
        if ($prob > $bestProb) {
            $bestProb  = $prob;
            $bestIndex = $idx;
        }
    }
    $used->[0] = $candidates[$bestIndex];
    return $used->[0];
}

# All four positional arguments are required and N must be a positive integer.
if (@ARGV < 4 || $ARGV[2] !~ /\A[1-9][0-9]*\z/) {
    usage();
    exit 1;
}
my ($trainFile, $heldOutFile, $n, $bogusFile) = @ARGV;

my $model = NGramLangModel->new($n, $trainFile);
$model->computeProbs();
$model->emSmooth($heldOutFile);
my $stream = NGramStream->new($bogusFile, $n);

# Chosen words, newest first.
my @used = ();

# NOTE(review): the chosen words are only kept in @used; nothing is written
# out anywhere. Emitting them needs a decision on the output format and
# encoding, which cannot be derived from this file.

# Initialisation: the first n-gram supplies N words at once. They are fed in
# from index N-1 down to 0 while the scored order grows from 1 to N.
# Presumably NGramStream stores the newest word at index 0 - verify against
# NGramStream.
my $firstNgram = $stream->next();
if ($firstNgram) {
    for my $order (1 .. $n) {
        chooseSpelling($model, \@used, $firstNgram->[$n - $order], $order);
    }

    # Every following n-gram contributes exactly one new word, at index 0.
    while (my $ngram = $stream->next()) {
        chooseSpelling($model, \@used, $ngram->[0], $n);
    }
}