|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Tokens: splits sentences into word and punctuation tokens.
# The file can be run directly as a filter (STDIN -> STDOUT) or loaded by
# another Perl file, in which case tokens() returns its result instead of
# printing it.
package Tokens;

use strict;
use warnings;
use utf8;    # the character classes below contain non-ASCII literals

use File::Basename;

# Standard input and output carry UTF-8 text.
binmode STDIN,  ':utf8';
binmode STDOUT, ':utf8';

# True when this file is executed directly (there is no caller frame at file
# level); false when it is loaded from another file. It selects between
# printing tokens and returning them.
my $pipe = !defined(caller);

# Directory that contains this file.
# NOTE(review): not referenced in the visible part of the file; kept in case
# code elsewhere relies on it.
my $abs_path = dirname(__FILE__);

# Character classes, stored as pattern strings, for upper-case and lower-case
# letters including the accented letters handled by this tokenizer.
my $UpperCase = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜ]";
my $LowerCase = "[a-záéíóúàèìòùâêîôûñçü]";

# Punctuation characters that tokens() always isolates as separate tokens.
my $Punct = qr/[\,\;\«\»\“\”\'\"\&\$\#\=\(\)\<\>\!\¡\?\¿\\\[\]\{\}\|\^\*\€\·\¬\…]/;

# Characters common inside URLs; tokens() isolates them only when they are
# followed by whitespace or the end of the sentence.
my $Punct_urls = qr/[\:\/\~]/;

# Alternation of English words, each accepted with an upper- or lower-case
# initial.
# NOTE(review): presumably the words that can carry a contraction; not
# referenced in the visible part of the file.
my $contr = "([Hh]e|[Hh]ere|[Hh]ow|[Ii]t|[Ss]he|[Tt]hat|[Tt]here|[Ww]hat|[Ww]hen|[Ww]here|[Ww]ho|[Ww]hy)";

# Character class for any single letter: the union of the upper- and
# lower-case sets above.
my $w = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜa-záéíóúàèìòùâêîôûñçü]";
|
|
|
# Splits each sentence into tokens.
#
# Parameters:
#   $sentences - reference to an array of sentence strings. The strings are
#                not modified.
#
# Returns a reference to an array holding the tokens of every sentence, with
# an empty string appended after the tokens of each sentence. When the
# file-level $pipe flag is true the tokens are printed to STDOUT instead (each
# followed by a space, one sentence per line) and the returned array is empty.
#
# Relies on the file-level patterns $Punct and $Punct_urls.
sub tokens {
    my ($sentences) = @_;

    # Placeholders that shield multi-character punctuation from the generic
    # punctuation split; they are turned back into the real text per token.
    my $susp   = "3SUSP012";    # ...
    my $duplo1 = "2DOBR111";    # <<
    my $duplo2 = "2DOBR222";    # >>
    my $duplo3 = "2DOBR333";    # ''
    my $duplo4 = "2DOBR444";    # ``

    # Placeholders for separators that appear between digits.
    my $dot_quant   = "44DOTQUANT77";
    my $comma_quant = "44COMMQUANT77";
    my $quote_quant = "44QUOTQUANT77";

    my %quant_mark = (
        '.' => $dot_quant,
        ',' => $comma_quant,
        "'" => $quote_quant,
    );

    # Replaces every separator inside one number with its placeholder and
    # appends a space so the number is detached from whatever follows it.
    my $protect_number = sub {
        my ($number) = @_;
        $number =~ s/([.,'])/$quant_mark{$1}/g;
        return "$number ";
    };

    my @saida = ();

    foreach my $original (@{$sentences}) {

        # foreach aliases the elements of the caller's array, so edit a copy
        # rather than rewriting the caller's data.
        my $sentence = $original;
        chomp $sentence;

        # Drop all trailing whitespace (including "\r" from CRLF input) so
        # the end-of-sentence rules below see the real last character.
        $sentence =~ s/\s+\z//;

        $sentence =~ s/\.\.\./ $susp /g;
        $sentence =~ s/\<\</ $duplo1 /g;
        $sentence =~ s/\>\>/ $duplo2 /g;
        $sentence =~ s/\'\'/ $duplo3 /g;
        $sentence =~ s/\`\`/ $duplo4 /g;

        # Protect whole numbers in a single pass. Separate passes per
        # separator would rescan earlier placeholders (their edges are
        # digits) and split values such as "1,234.56".
        $sentence =~ s/([0-9]+(?:[.,'][0-9]+)+)/$protect_number->($1)/ge;

        # Isolate punctuation.
        $sentence =~ s/($Punct)/ $1 /g;

        # URL-like punctuation is isolated only at the end of a word.
        $sentence =~ s/($Punct_urls)(?:[\s\n]|$)/ $1 /g;

        # Detach hyphens that sit at a word edge; word-internal hyphens stay.
        $sentence =~ s/(\w)- /$1 - /g;
        $sentence =~ s/ -(\w)/ - $1/g;
        $sentence =~ s/(\w)-$/$1 -/g;
        $sentence =~ s/^-(\w)/- $1/g;

        # Detach the sentence-final period.
        $sentence =~ s/\.$/ . /;

        # split " " discards leading whitespace and never yields tokens that
        # contain whitespace, so no per-token trimming is needed.
        foreach my $token (split(" ", $sentence)) {

            # Turn placeholders back into the text they stand for.
            $token =~ s/$susp/.../g;
            $token =~ s/$duplo1/<</g;
            $token =~ s/$duplo2/>>/g;
            $token =~ s/$duplo3/''/g;
            $token =~ s/$duplo4/``/g;
            $token =~ s/$dot_quant/./g;
            $token =~ s/$comma_quant/,/g;
            $token =~ s/$quote_quant/'/g;

            if ($pipe) {
                print "$token ";
            }
            else {
                push(@saida, $token);
            }
        }

        # Mark the end of the sentence.
        if ($pipe) {
            print "\n";
        }
        else {
            push(@saida, "");
        }
    }

    return \@saida;
}
|
|
|
|
|
# Script mode: tokenize STDIN and let tokens() print the result. Lines are
# processed one at a time so output starts before the input ends and the
# whole input is never held in memory.
if ($pipe) {
    while (my $line = <STDIN>) {
        tokens([$line]);
    }
}
|
|
|
|
|
|
|
|
|
# Maps a punctuation token to its tag.
#
# Parameters:
#   $p - the punctuation token.
#
# Returns the tag string (for example "Fp" for "."), or the empty string when
# the token is not a recognised punctuation mark.
sub punct {
    my ($p) = @_;

    # Tokens that must match exactly.
    my %tag_for = (
        '.'   => "Fp",
        ','   => "Fc",
        ':'   => "Fd",
        ';'   => "Fx",
        '...' => "Fs",
        '%'   => "Ft",
        '('   => "Fpa",
        ')'   => "Fpt",
        '¿'   => "Fia",
        '?'   => "Fit",
        '¡'   => "Faa",
        '!'   => "Fat",
        '['   => "Fca",
        ']'   => "Fct",
        '{'   => "Fla",
        '}'   => "Flt",
    );
    return $tag_for{$p} if exists $tag_for{$p};

    # Tokens recognised by pattern. None of these patterns can match one of
    # the exact tokens above, so the order of the two groups does not matter.
    return "Fg"  if $p =~ /^(\-|\-\-)$/;
    return "Fe"  if $p =~ /^(\'|\"|\`\`|\'\')$/;
    return "Fra" if $p =~ /^(\<\<|«)/;
    return "Frc" if $p =~ /^(\>\>|»)/;
    return "Fh"  if $p =~ /^(\/|\\)$/;

    return "";
}
|
|
|
|
|
# Returns a lower-cased copy of a string.
#
# Parameters:
#   $x - the string to convert.
#
# lc() may leave accented capitals unchanged when the string is stored
# internally as bytes, so the Latin-1 capital letters are also mapped
# explicitly. The ranges are written as code points (U+00C0-U+00D6 and
# U+00D8-U+00DE) and skip U+00D7, the multiplication sign.
sub lowercase {
    my ($x) = @_;

    $x = lc($x);
    $x =~ tr/\x{C0}-\x{D6}\x{D8}-\x{DE}/\x{E0}-\x{F6}\x{F8}-\x{FE}/;

    return $x;
}
|
|
|
# Returns a copy of a string with all leading and trailing whitespace removed.
#
# Parameters:
#   $x - the string to trim.
sub Trim {
    my ($x) = @_;

    $x =~ s/\A\s+//;
    $x =~ s/\s+\z//;    # every trailing whitespace character, not just one

    return $x;
}
|
|