From d01ce30c63b7cb89b55b1e68505e8b9df1c4a571 Mon Sep 17 00:00:00 2001 From: Flammie A Pirinen Date: Mon, 19 Feb 2024 17:29:41 +0100 Subject: [PATCH] hackarounds for low mem stuff --- scripts/unimorph/convert.py | 30 +++++++++++++++++++++--- scripts/unimorph/generate-alphabets.bash | 17 +++++++------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/scripts/unimorph/convert.py b/scripts/unimorph/convert.py index 06001fa0..7f1f9861 100755 --- a/scripts/unimorph/convert.py +++ b/scripts/unimorph/convert.py @@ -23,6 +23,8 @@ def giella2unimorph(tags): unimorphtags += ['N'] elif giella == 'Det': unimorphtags += ['DET'] + elif giella == 'Part': + unimorphtags += ['PART'] elif giella == 'Pcle': unimorphtags += ['PART'] elif giella == 'Adv': @@ -39,6 +41,8 @@ def giella2unimorph(tags): unimorphtags += ['ADJ'] elif giella == 'Adj': unimorphtags += ['ADJ'] + elif giella == 'Intj': + unimorphtags += ['INTJ'] elif giella == 'Interj': unimorphtags += ['INTJ'] elif giella == 'CC': @@ -141,6 +145,8 @@ def giella2unimorph(tags): unimorphtags += ['IND'] # XXX: can sometimes be indef? elif giella == 'Prs': unimorphtags += ['PRS'] + elif giella == 'Past': + unimorphtags += ['PST'] elif giella == 'Prt': unimorphtags += ['PST'] elif giella == 'Prt1': @@ -330,6 +336,8 @@ def giella2unimorph(tags): unimorphtags += ['OBGLIG'] elif giella == 'Interr': unimorphtags += ['INT'] # XXX: ABE? + elif giella in ['Der1', 'Der2']: + continue elif giella == 'Der/Comp': unimorphtags += ['CMPR'] elif giella == 'Comp': @@ -375,6 +383,8 @@ def giella2unimorph(tags): continue elif giella == 'ABBR': continue + elif giella == 'ACRO': + continue elif giella == 'ACR': continue elif giella == 'LEFT': @@ -397,12 +407,14 @@ def giella2unimorph(tags): continue elif giella == 'Ord': continue - elif giella in ['v1', 'v2', 'v3', 'v4', 'v5']: + elif giella in ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']: continue elif giella in ['G3', 'G7']: continue elif giella.startswith('Sem'): continue + elif giella == 'Dummytag': + continue elif giella == 'S': continue elif giella == 'Quote': @@ -448,10 +460,14 @@ def giella2unimorph(tags): unimorphtags += ['TR'] elif giella == 'Impers': unimorphtags += ['IMPRS'] + elif giella == 'Reflex': + unimorphtags += ['REFL'] elif giella == 'Refl': unimorphtags += ['REFL'] elif giella == 'Recipr': unimorphtags += ['RECP'] + elif giella == 'Distr': + unimorphtags += ['REM'] elif giella == 'Dist': unimorphtags += ['REM'] elif giella == 'Prox': @@ -462,6 +478,8 @@ def giella2unimorph(tags): elif giella == 'AssocColl': # myv continue + elif giella in ['0,0', '0,1']: + continue elif giella in ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F00', 'F01', 'F02', 'F03', 'F04', 'F05', 'F06', 'F07', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F08', @@ -480,7 +498,7 @@ def giella2unimorph(tags): 'B', 'C', 'E', 'D', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'W', 'X', 'Y', 'Z', - 'Š', 'Ž', 'Ä', 'Õ', 'Ö', 'Ü']: + 'Š', 'Ž', 'Ä', 'Õ', 'Ö', 'Ü', '0']: # est continue elif giella == 'Adn': @@ -540,13 +558,19 @@ def giella2unimorph(tags): elif giella.startswith('AErr/'): print('SOmething broken here½!', tags) unimorphtags += ['TYPO'] + elif '' in giella: + print('SOmething broken here½!', tags) + elif '' in giella: + print('SOmething broken here½!', tags) elif '' in giella: print('SOmething broken here½!', tags) + elif 'N224-1-9' in giella: + print('SOmething broken here½!', tags) elif '#222-5-19' in giella: print('SOmething broken here½!', tags) elif '/-' in giella: print('SOmething broken here½!', tags) - elif giella in ['a', 'b', 'i', 't', 'd', 's', 'n', 'ä']: + elif giella in ['a', 'b', 'i', 't', 'd', 's', 'n', 'ä', 'ö']: print('SOmething broken here½!', tags) elif giella in ['Ne', 'Ni', 'Nte', 'Ntee', 'Nt', 'Nti', 'Na', 'No', 'N-', 'c']: diff --git a/scripts/unimorph/generate-alphabets.bash b/scripts/unimorph/generate-alphabets.bash index 82780c0e..6e9f6557 100755 --- a/scripts/unimorph/generate-alphabets.bash +++ b/scripts/unimorph/generate-alphabets.bash @@ -20,13 +20,14 @@ if test ! -f "$generator" ; then echo "Could not find generator automaton $generator" exit 1 fi -echo "$cyclicRE +UglyHack | [? - [ $cyclicRE ] ]* ;" | - sed -e 's/+/%+/g' -e 's:/:%/:g' -e 's/#/%#/g' -e 's/\^/%^/g' > generative.regex -hfst-regexp2fst -i generative.regex -o generative.hfst -f foma -hfst-compose -F -1 generative.hfst -2 "$generator" |\ - hfst-fst2fst -f olw -o generator.hfst for c in a b c d e f g h i j k l m n o p q r s t u v x y z å ä ö š ž ; do - hfst-fst2strings -c 0 generator.hfst -p $c -done > generated.alpha -uniq < generated.alpha | "$(dirname "$0")"/convert.py + echo "$cyclicRE +UglyHack | $c [? - [ $cyclicRE ] ]* ;" | + sed -e 's/+/%+/g' -e 's:/:%/:g' -e 's/#/%#/g' -e 's/\^/%^/g' > generative.$c.regex + hfst-regexp2fst -i generative.$c.regex -o generative.$c.hfst -f foma + hfst-compose -F -1 generative.$c.hfst -2 "$generator" |\ + hfst-fst2fst -f olw -o generator.$c.hfst + hfst-fst2strings -c 0 generator.$c.hfst > generated.$c + echo $c + uniq < generated.$c | "$(dirname "$0")"/convert.py +done