#!/usr/bin/perl use strict; use warnings; use locale; =head1 NAME Tagger - modul pro POS tagging. =head1 SYNOPSIS $tagger = Tagger->new(@trainingFiles); $word = "a"; print $tagger->getTag($word), "\n"; @words = ("a", "ale"); @tags = $tagger->getTags(@words); =head1 METHODS =head2 PUBLIC =cut package Tagger; =head3 new(@files) = Tagger Vytvori novy tagger a soubory @files pouzije k natrenovani znacek. =cut sub new { my $self = {}; bless $self; shift; $self->{words} = (); $self->{tags} = (); $self->{dirty} = 1; for my $f (@_) { $self->addFile($f); } return $self; } =head3 addFile($file) Pouzije soubor $file k natrenovani nejpravdepodobnejsi znacky. =cut sub addFile { my ($self, $file) = @_; open(my $fh, "<$file"); while ( <$fh> ) { chomp; my @p = split(/\t/, $_); if ( $p[1] ) { $p[0] = lc ($p[0]); $self->{words}->{$p[0]}{$p[1]}++; } } } =head3 getTags(@words) = @tags Vraci znacky @tags pro slovo z @words. =cut sub getTags { my $self = shift; if ( $self->{dirty} ) { $self->doComputeTags(); } my @res = (); for my $w (@_) { push(@res, $self->doGetTag($w)); } return @res; } =head3 getTag($word) = $tag Vraci znacku $tag pro slovo $word. =cut sub getTag { my ($self, $w) = @_; if ( $self->{dirty} ) { $self->doComputeTags(); } return $self->doGetTag($w); } =head2 PRIVATE =cut =head3 doComputeTags() Z trenovacich dat spocita nejpravdepodobnejsi znacku. =cut sub doComputeTags { my $self = shift; $self->{dirty} = 0; $self->{tags} = (); my $words = $self->{words}; for my $k ( keys %{$words} ) { my @t = ( sort { $words->{$k}{$b} <=> $words->{$k}{$a} } keys %{$words->{$k}} ); $self->{tags}->{$k} = $t[0]; } } =head3 doGetTag($word) = bool Vraci znacku $tag pro slovo $word. =cut sub doGetTag { my ($self, $w) = @_; $w = lc($w); my $res = $self->{tags}->{$w}; if ( $res ) { return $res; } # print STDERR "dogetTag: $w\n"; if ( $self->isPreposition($w) ) { return "R"; } elsif ( $self->isConjunction($w) ) { return "J"; } elsif ( $self->isAdj($w) ) { return "A"; } elsif ( $self->isNum($w) ) { return "C"; } elsif ( $self->isVerb($w) ) { return "V"; } elsif ( $self->isAdv($w) ) { return "D"; } elsif ( $self->isZTag($w) ) { return "Z"; } else { my %m = (); if ( $m{$w} ) { return $m{$w}; } } return "N"; } =head3 isPreposition($word) = bool Vrati 1, pokud je $word predlozka. Jinak 0. =cut sub isPreposition { my ($self, $w) = @_; my @prep = ("v", "z", "ze", "pøes", "okolo", "pro"); return scalar (grep { $_ eq $w } @prep); } =head3 isConjunction($word) = bool Vrati 1, pokud je $word spojka. Jinak 0. =cut sub isConjunction { my ($self, $w) = @_; my @prep = ("a", "ale", "aby", "i", ); return scalar (grep { $_ eq $w } @prep); } =head3 isAdj($word) = bool Vrati 1, pokud je $word pridavne jmeno. Jinak 0. =cut sub isAdj { my ($self, $w) = @_; if ( $w =~ /[áýíé]$/ ) { return 1; } else { return 0; } } =head3 isNum($word) = bool Vrati 1, pokud je $word cislovka. Jinak 0. =cut sub isNum { my ($self, $w) = @_; return $w =~ /[0-9]+([,.][0-9]*)?/; } =head3 isVerb($word) = bool Vrati 1, pokud je $word sloveso. Jinak 0. =cut sub isVerb { my ($self, $w) = @_; return $w =~ /(il|la|li|ly|el|lo|uje|ujeme|et|it|at)$/; } =head3 isAdv($word) = bool Vrati 1, pokud je $word prislovce. Jinak 0. =cut sub isAdv { my ($self, $w) = @_; if ( $w =~ /(nì)$/ ) { return 1; } else { return 0; } } =head3 isZTag($word) = bool Vrati 1, pokud je $word Z. Jinak 0. =cut sub isZTag { my ($self, $w) = @_; my @z = ("%", ",", ".", "(", ")", ",", ";", "*", "/", "\\", "+", "-", "\"", "?", "!", "|"); return scalar (grep { $_ eq $w } @z); } =head1 COPYRIGHT Copyright 2011 Martin Majlis =cut 1;