-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaistrigh
executable file
·152 lines (137 loc) · 3.44 KB
/
aistrigh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
#use Lingua::GA::Gramadoir;
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
#my $gr = new Lingua::GA::Gramadoir;
my %lex;
my @rialacha;
sub tolower
{
(my $inp) = @_;
$inp =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZÁÉÍÓÚ/abcdefghijklmnopqrstuvwxyzáéíóú/;
return $inp;
}
sub toupper
{
(my $inp) = @_;
$inp =~ tr/abcdefghijklmnopqrstuvwxyzàèìòù/ABCDEFGHIJKLMNOPQRSTUVWXYZÀÈÌÒÙ/;
return $inp;
}
# like w, but without the upper/lowercase acrobatics
sub raw_w
{
(my $word, my $num) = @_;
if (exists($lex{$word})) {
my @aistriuchain = split(/;/,$lex{$word});
if ($num == -1) {
if (scalar @aistriuchain > 1) {
# but eventually put disambig code here!
$num = 0;
}
else {
$num = 0;
}
}
if ($num >= 0 and $num < scalar @aistriuchain) {
return $aistriuchain[$num];
}
else {
print STDERR "numerical argument $num to w out of range for $word\n";
}
}
else {
return '';
}
}
# 5 Aug 2005; now also pass <t> markup as part of "word",
# so as to include the disambig info when it's there
sub w
{
(my $word, my $num) = @_;
print STDERR "In w with word=$word, num=$num\n";
if ($num==-1 and $word =~ m/^<t ([^>]+)>/) {
$num = $1;
}
$word =~ s/^<t[^>]*>//;
$word =~ s/<\/t>$//;
print STDERR "In w with word=$word, num=$num\n";
my $ans = raw_w($word,$num);
return $ans if $ans;
my $lower = $word;
$lower =~ s/(>[^<]+<)/tolower($1)/e;
$ans = raw_w($lower,$num);
if ($ans) {
$ans =~ s/^(.)/toupper($1)/e;
return $ans;
}
else {
$word =~ s/<[^>]+>//g;
$word =~ tr/ÁÉÍÓÚáéíóú/ÀÈÌÒÙàèìòù/;
return $word;
}
}
## START OF MAIN
# read in reordering rules
open (RIAIL, "<:utf8", "/usr/local/share/ga2gd/rialacha.txt") or die "Could not open rules file: $!\n";
while (<RIAIL>) {
chomp;
s/#.*$//;
if (/\S/) {
(my $patt, my $repl) = m/^(.*) -> (.*)$/;
# note hacky "[s]" on RHS; prevents subst on next line from taking place!
$patt =~ s/<t>(.+?)<\/t>/<w><t>$1<\/t><[s]><[A-Z][^>]*>[^<]+<\/[A-Z]><\/s><\/w>/g;
$patt =~ s/<s>(.+?)<\/s>/<w><t><[A-Z][^>]*>[^<]+<\/[A-Z]><\/t><s>$1<\/s><\/w>/g;
$patt =~ s/(<t>.+?<\/t>)/($1)/g if ($repl =~ m/\$[1-9]/);
$patt =~ s/<t>/<t[^>]*>/g;
$repl =~ s/^/"/;
$repl =~ s/$/"/;
$repl =~ s/(w\([^)]+\))/".$1."/g;
$repl =~ s/w\(([^),]+)\)/w($1,-1)/g;
# print STDERR "compiling pattern = \"$patt\"\n";
push @rialacha, {'patt' => qr/$patt/, 'repl' => $repl};
}
}
# read in cuardach.txt to lex
open (CUARD, "<:utf8", "/usr/local/share/ga2gd/cuardach.txt") or die "Could not open bilingual lexicon: $!\n";
while (<CUARD>) {
m/^(<[^>]+>[^<]+<\/[A-Z]>) (.*)$/;
my $mykey = $1;
if (exists($lex{$mykey})) {
my @curr = split /;/,$lex{$mykey};
my @toadd = split /;/,$2;
if (scalar @curr != scalar @toadd) {
print STDERR "Alignment problem with $lex{$mykey} and $2\n";
}
else {
$lex{$mykey}='';
for (my $i=0; $i < @curr; $i++) {
$lex{$mykey} .= "$curr[$i]/$toadd[$i];";
}
$lex{$mykey} =~ s/;$//;
}
}
else {
$lex{$1} = $2;
}
}
close CUARD;
# 28 May 2005; now reads output of "stemmer -l" -
# XML format with tagged words followed immediately by
# tagged stem. One sentence per line.
while (<STDIN>) {
# print STDERR "before rules:\n$_\n";
foreach my $rule (@rialacha) {
my $p=$rule->{'patt'};
my $r=$rule->{'repl'};
# print STDERR "applying rule with repl=$r\n";
# my $c = scalar (m/$p/g);
# print STDERR "matches $c times\n";
s/$p/$r/eeg;
}
print;
}
exit 0;