Skip to content

Commit

Permalink
Workarounds for low-memory situations
Browse files Browse the repository at this point in the history
  • Loading branch information
flammie committed Feb 19, 2024
1 parent 2238ec5 commit d01ce30
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 11 deletions.
30 changes: 27 additions & 3 deletions scripts/unimorph/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def giella2unimorph(tags):
unimorphtags += ['N']
elif giella == 'Det':
unimorphtags += ['DET']
elif giella == 'Part':
unimorphtags += ['PART']
elif giella == 'Pcle':
unimorphtags += ['PART']
elif giella == 'Adv':
Expand All @@ -39,6 +41,8 @@ def giella2unimorph(tags):
unimorphtags += ['ADJ']
elif giella == 'Adj':
unimorphtags += ['ADJ']
elif giella == 'Intj':
unimorphtags += ['INTJ']
elif giella == 'Interj':
unimorphtags += ['INTJ']
elif giella == 'CC':
Expand Down Expand Up @@ -141,6 +145,8 @@ def giella2unimorph(tags):
unimorphtags += ['IND'] # XXX: can sometimes be indef?
elif giella == 'Prs':
unimorphtags += ['PRS']
elif giella == 'Past':
unimorphtags += ['PST']
elif giella == 'Prt':
unimorphtags += ['PST']
elif giella == 'Prt1':
Expand Down Expand Up @@ -330,6 +336,8 @@ def giella2unimorph(tags):
unimorphtags += ['OBGLIG']
elif giella == 'Interr':
unimorphtags += ['INT'] # XXX: ABE?
elif giella in ['Der1', 'Der2']:
continue
elif giella == 'Der/Comp':
unimorphtags += ['CMPR']
elif giella == 'Comp':
Expand Down Expand Up @@ -375,6 +383,8 @@ def giella2unimorph(tags):
continue
elif giella == 'ABBR':
continue
elif giella == 'ACRO':
continue
elif giella == 'ACR':
continue
elif giella == 'LEFT':
Expand All @@ -397,12 +407,14 @@ def giella2unimorph(tags):
continue
elif giella == 'Ord':
continue
elif giella in ['v1', 'v2', 'v3', 'v4', 'v5']:
elif giella in ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']:
continue
elif giella in ['G3', 'G7']:
continue
elif giella.startswith('Sem'):
continue
elif giella == 'Dummytag':
continue
elif giella == 'S':
continue
elif giella == 'Quote':
Expand Down Expand Up @@ -448,10 +460,14 @@ def giella2unimorph(tags):
unimorphtags += ['TR']
elif giella == 'Impers':
unimorphtags += ['IMPRS']
elif giella == 'Reflex':
unimorphtags += ['REFL']
elif giella == 'Refl':
unimorphtags += ['REFL']
elif giella == 'Recipr':
unimorphtags += ['RECP']
elif giella == 'Distr':
unimorphtags += ['REM']
elif giella == 'Dist':
unimorphtags += ['REM']
elif giella == 'Prox':
Expand All @@ -462,6 +478,8 @@ def giella2unimorph(tags):
elif giella == 'AssocColl':
# myv
continue
elif giella in ['0,0', '0,1']:
continue
elif giella in ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9',
'F00', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07',
'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F08',
Expand All @@ -480,7 +498,7 @@ def giella2unimorph(tags):
'B', 'C', 'E', 'D', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'W', 'X',
'Y', 'Z',
'Š', 'Ž', 'Ä', 'Õ', 'Ö', 'Ü']:
'Š', 'Ž', 'Ä', 'Õ', 'Ö', 'Ü', '0']:
# est
continue
elif giella == 'Adn':
Expand Down Expand Up @@ -540,13 +558,19 @@ def giella2unimorph(tags):
elif giella.startswith('AErr/'):
print('SOmething broken here½!', tags)
unimorphtags += ['TYPO']
elif '<cnjcoo>' in giella:
print('SOmething broken here½!', tags)
elif '<actv>' in giella:
print('SOmething broken here½!', tags)
elif '<gen>' in giella:
print('SOmething broken here½!', tags)
elif 'N224-1-9' in giella:
print('SOmething broken here½!', tags)
elif '#222-5-19' in giella:
print('SOmething broken here½!', tags)
elif '/-' in giella:
print('SOmething broken here½!', tags)
elif giella in ['a', 'b', 'i', 't', 'd', 's', 'n', 'ä']:
elif giella in ['a', 'b', 'i', 't', 'd', 's', 'n', 'ä', 'ö']:
print('SOmething broken here½!', tags)
elif giella in ['Ne', 'Ni', 'Nte', 'Ntee', 'Nt', 'Nti', 'Na', 'No',
'N-', 'c']:
Expand Down
17 changes: 9 additions & 8 deletions scripts/unimorph/generate-alphabets.bash
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,14 @@ if test ! -f "$generator" ; then
echo "Could not find generator automaton $generator"
exit 1
fi
echo "$cyclicRE +UglyHack | [? - [ $cyclicRE ] ]* ;" |
sed -e 's/+/%+/g' -e 's:/:%/:g' -e 's/#/%#/g' -e 's/\^/%^/g' > generative.regex
hfst-regexp2fst -i generative.regex -o generative.hfst -f foma
hfst-compose -F -1 generative.hfst -2 "$generator" |\
hfst-fst2fst -f olw -o generator.hfst
for c in a b c d e f g h i j k l m n o p q r s t u v x y z å ä ö š ž ; do
hfst-fst2strings -c 0 generator.hfst -p $c
done > generated.alpha
uniq < generated.alpha | "$(dirname "$0")"/convert.py
echo "$cyclicRE +UglyHack | $c [? - [ $cyclicRE ] ]* ;" |
sed -e 's/+/%+/g' -e 's:/:%/:g' -e 's/#/%#/g' -e 's/\^/%^/g' > generative.$c.regex
hfst-regexp2fst -i generative.$c.regex -o generative.$c.hfst -f foma
hfst-compose -F -1 generative.$c.hfst -2 "$generator" |\
hfst-fst2fst -f olw -o generator.$c.hfst
hfst-fst2strings -c 0 generator.$c.hfst > generated.$c
echo $c
uniq < generated.$c | "$(dirname "$0")"/convert.py
done

0 comments on commit d01ce30

Please sign in to comment.