#!/usr/bin/perl
use strict;
use warnings;
use locale;

=head1 NAME

Tagger - modul pro POS tagging.

=head1 SYNOPSIS

	$tagger = Tagger->new(@trainingFiles);
	$word = "a";
	print $tagger->getTag($word), "\n";
	@words = ("a", "ale");
	@tags = $tagger->getTags(@words);

=head1 METHODS

=head2 PUBLIC

=cut

package Tagger;


=head3 new(@files) = Tagger

Vytvori novy tagger a soubory @files pouzije k natrenovani znacek.

=cut
sub new
{
	my $self = {};
	bless $self;
	shift;
	$self->{words} = ();
	$self->{tags} = ();
	$self->{dirty} = 1;

	for my $f (@_) { 
		$self->addFile($f);
	}

	return $self;
}

=head3 addFile($file)

Pouzije soubor $file k natrenovani nejpravdepodobnejsi znacky.

=cut
sub addFile
{
	my ($self, $file) = @_;
	open(my $fh, "<$file");
	while ( <$fh> ) { 
		chomp;
		my @p = split(/\t/, $_);
		if ( $p[1] ) {
			$p[0] = lc ($p[0]);
			$self->{words}->{$p[0]}{$p[1]}++;
		}	
	}
}

=head3 getTags(@words) = @tags

Vraci znacky @tags pro slovo z @words.

=cut
sub getTags
{
	my $self = shift;

	if ( $self->{dirty} ) { 
		$self->doComputeTags();
	}
	
	my @res = ();
	for my $w (@_) { 
		push(@res, $self->doGetTag($w));
	}

	return @res;
}

=head3 getTag($word) = $tag

Vraci znacku $tag pro slovo $word.

=cut
sub getTag
{
	my ($self, $w) = @_;

	if ( $self->{dirty} ) { 
		$self->doComputeTags();
	}
	
	return $self->doGetTag($w);

}

=head2 PRIVATE

=cut

=head3 doComputeTags()

Z trenovacich dat spocita nejpravdepodobnejsi znacku.

=cut
sub doComputeTags
{
	my $self = shift;

	$self->{dirty} = 0;

	$self->{tags} = ();
	my $words = $self->{words};
	
	for my $k ( keys %{$words} ) { 
		my @t = ( sort { $words->{$k}{$b} <=> $words->{$k}{$a} } 
					keys %{$words->{$k}} );

		$self->{tags}->{$k} = $t[0];
	}
}


=head3 doGetTag($word) = bool

Vraci znacku $tag pro slovo $word.

=cut
sub doGetTag
{
	my ($self, $w) = @_;
	$w = lc($w);

	my $res = $self->{tags}->{$w};
	if ( $res ) { 
		return $res;
	}
	
#	print STDERR "dogetTag: $w\n";
	if ( $self->isPreposition($w) ) { 
		return "R";
	} elsif ( $self->isConjunction($w) ) { 
		return "J";
	} elsif ( $self->isAdj($w) ) { 
		return "A";		
	} elsif ( $self->isNum($w) ) { 
		return "C";
	} elsif ( $self->isVerb($w) ) { 
		return "V";
	} elsif ( $self->isAdv($w) ) { 
		return "D";	
	} elsif ( $self->isZTag($w) ) { 
		return "Z";	
	} else {
		my %m = ();
		if ( $m{$w} ) { 
			return $m{$w};
		}	
	}
	
	return "N";

}


=head3 isPreposition($word) = bool

Vrati 1, pokud je $word predlozka. Jinak 0.

=cut
sub isPreposition
{
	my ($self, $w) = @_;
	my @prep = ("v", "z", "ze", "pøes", "okolo", "pro");
	return scalar (grep { $_ eq $w } @prep);
}

=head3 isConjunction($word) = bool

Vrati 1, pokud je $word spojka. Jinak 0.

=cut
sub isConjunction
{
	my ($self, $w) = @_;
	my @prep = ("a", "ale", "aby", "i", );
	return scalar (grep { $_ eq $w } @prep);
}

=head3 isAdj($word) = bool

Vrati 1, pokud je $word pridavne jmeno. Jinak 0.

=cut
sub isAdj
{
	my ($self, $w) = @_;
	if ( $w =~ /[áũíé]$/ ) { 
		return 1;
	} else { 
		return 0;
	}
}

=head3 isNum($word) = bool

Vrati 1, pokud je $word cislovka. Jinak 0.

=cut
sub isNum
{
	my ($self, $w) = @_;
	return $w =~ /[0-9]+([,.][0-9]*)?/;
}

=head3 isVerb($word) = bool

Vrati 1, pokud je $word sloveso. Jinak 0.

=cut
sub isVerb
{
	my ($self, $w) = @_;
	return $w =~ /(il|la|li|ly|el|lo|uje|ujeme|et|it|at)$/;
}

=head3 isAdv($word) = bool

Vrati 1, pokud je $word prislovce. Jinak 0.

=cut
sub isAdv
{
	my ($self, $w) = @_;
	if ( $w =~ /(nė)$/ ) { 
		return 1;
	} else { 
		return 0;
	}
}

=head3 isZTag($word) = bool

Vrati 1, pokud je $word Z. Jinak 0.

=cut
sub isZTag
{
	my ($self, $w) = @_;
	my @z = ("%", ",", ".", "(", ")", ",", ";", "*", "/", "\\", "+", "-", "\"", "?", "!", "|");
	return scalar (grep { $_ eq $w } @z);
}


=head1 COPYRIGHT

Copyright 2011 Martin Majlis

=cut


1;