diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 2d21a55..c074eb7 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -688,6 +688,20 @@ FSTProcessor::filterFinals(const State& state, UStringView casefrom) uppercase, firstupper, 0); } +UString +FSTProcessor::filterFinals(const ReusableState& state, UStringView casefrom) +{ + bool firstupper = false, uppercase = false; + if (!dictionaryCase) { + firstupper = u_isupper(casefrom[0]); + uppercase = (casefrom.size() > 1 && + firstupper && u_isupper(casefrom[casefrom.size()-1])); + } + return state.filterFinals(all_finals, alphabet, escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0); +} + void FSTProcessor::writeEscaped(UStringView str, UFILE *output) { @@ -983,7 +997,9 @@ FSTProcessor::analysis(InputFile& input, UFILE *output) bool last_incond = false; bool last_postblank = false; bool last_preblank = false; - State current_state = initial_state; + //State current_state = initial_state; + ReusableState current_state; + current_state.init(&root); UString lf; // analysis (lexical form and tags) UString sf; // surface form UString lf_spcmp; // space compound analysis @@ -1238,7 +1254,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output) } } - current_state = initial_state; + current_state.init(&root); lf.clear(); sf.clear(); last_start = input_buffer.getPos(); diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index f800f17..3eeb6fa 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -324,6 +324,7 @@ class FSTProcessor * Assumes that casefrom is non-empty */ UString filterFinals(const State& state, UStringView casefrom); + UString filterFinals(const ReusableState& state, UStringView casefrom); /** * Write a string to an output stream, diff --git a/lttoolbox/reusable_state.cc b/lttoolbox/reusable_state.cc index c438fa5..aa55c95 100644 --- a/lttoolbox/reusable_state.cc +++ b/lttoolbox/reusable_state.cc @@ -1,24 +1,27 @@ #include +#include -ReusableState::ReusableState() {} - -ReusableState::~ReusableState() -{ - destroy(); -} +#define WalkBack(var, pos, block) { \ + size_t index = pos; \ + while (index != 0) { \ + auto& var = get(index); \ + block \ + index = var.prev; \ + } \ + } -ReusableState::ReusableState(const ReusableState& s) -{ - copy(s); -} +#define StepLoop(block) { \ + size_t new_start = end; \ + for (size_t i = start; i < new_start; i++) { \ + block \ + } \ + start = new_start; \ + epsilonClosure(); \ + } -ReusableState& ReusableState::operator=(const ReusableState& s) -{ - if (this != &s) copy(s); - return *this; -} +ReusableState::ReusableState() {} -void ReusableState::destroy() +ReusableState::~ReusableState() { for (size_t i = 0; i < steps.size(); i++) { delete steps[i]; @@ -26,21 +29,7 @@ void ReusableState::destroy() steps.clear(); } -void ReusableState::copy(const ReusableState& s) -{ - destroy(); - size_t N = (s.end >> STATE_STEP_BLOCK_SIZE_EXP) + 1; - steps.reserve(N); - for (size_t i = 0; i < N; i++) { - auto block = new std::array; - *block = *(s.steps[i]); - steps.push_back(block); - } - start = s.start; - end = s.end; -} - -ReusableState::Step& ReusableState::create(size_t index) +ReusableState::Step& ReusableState::get_or_create(size_t index) { size_t a = index >> STATE_STEP_BLOCK_SIZE_EXP; size_t b = index & (STATE_STEP_BLOCK_SIZE-1); @@ -67,7 +56,7 @@ bool ReusableState::apply(int32_t input, size_t pos, it = prev.where->transitions.find(input); if (it != prev.where->transitions.end()) { for (int j = 0; j < it->second.size; j++) { - Step& next = create(end); + Step& next = get_or_create(end); next.where = it->second.dest[j]; next.symbol = it->second.out_tag[j]; if (old_sym && next.symbol == old_sym) next.symbol = new_sym; @@ -101,18 +90,25 @@ void ReusableState::init(Node* initial) } start = 0; end = 1; - create(0).where = initial; + get_or_create(0).where = initial; epsilonClosure(); } -void ReusableState::step(int32_t input) +void ReusableState::reinit(Node* initial) { - size_t new_start = end; - for (size_t i = start; i < new_start; i++) { - apply(input, i, 0, 0, false); - } - start = new_start; + size_t start_was = start; + get_or_create(end).where = initial; + start = end; + end++; epsilonClosure(); + start = start_was; +} + +void ReusableState::step(int32_t input) +{ + StepLoop({ + apply(input, i, 0, 0, false); + }) } void ReusableState::step(int32_t input, int32_t alt) @@ -121,24 +117,18 @@ void ReusableState::step(int32_t input, int32_t alt) step(input); return; } - size_t new_start = end; - for (size_t i = start; i < new_start; i++) { - apply(input, i, 0, 0, false); - apply(alt, i, 0, 0, true); - } - start = new_start; - epsilonClosure(); + StepLoop({ + apply(input, i, 0, 0, false); + apply(alt, i, 0, 0, true); + }) } void ReusableState::step_override(int32_t input, int32_t old_sym, int32_t new_sym) { - size_t new_start = end; - for (size_t i = start; i < new_start; i++) { - apply(input, i, old_sym, new_sym, false); - } - start = new_start; - epsilonClosure(); + StepLoop({ + apply(input, i, old_sym, new_sym, false); + }) } void ReusableState::step_override(int32_t input, int32_t alt, @@ -148,13 +138,10 @@ void ReusableState::step_override(int32_t input, int32_t alt, step_override(input, old_sym, new_sym); return; } - size_t new_start = end; - for (size_t i = start; i < new_start; i++) { - apply(input, i, old_sym, new_sym, false); - apply(alt, i, old_sym, new_sym, true); - } - start = new_start; - epsilonClosure(); + StepLoop({ + apply(input, i, old_sym, new_sym, false); + apply(alt, i, old_sym, new_sym, true); + }) } void ReusableState::step_careful(int32_t input, int32_t alt) @@ -163,14 +150,11 @@ void ReusableState::step_careful(int32_t input, int32_t alt) step(input); return; } - size_t new_start = end; - for (size_t i = start; i < new_start; i++) { - if (!apply(input, i, 0, 0, false)) { - apply(alt, i, 0, 0, true); - } - } - start = new_start; - epsilonClosure(); + StepLoop({ + if (!apply(input, i, 0, 0, false)) { + apply(alt, i, 0, 0, true); + } + }) } void ReusableState::step(int32_t input, int32_t alt1, int32_t alt2) @@ -182,28 +166,22 @@ void ReusableState::step(int32_t input, int32_t alt1, int32_t alt2) step(input, alt1); return; } - size_t new_start = end; - for (size_t i = start; i < new_start; i++) { - apply(input, i, 0, 0, false); - apply(alt1, i, 0, 0, true); - apply(alt2, i, 0, 0, true); - } - start = new_start; - epsilonClosure(); + StepLoop({ + apply(input, i, 0, 0, false); + apply(alt1, i, 0, 0, true); + apply(alt2, i, 0, 0, true); + }) } void ReusableState::step(int32_t input, std::set alts) { - size_t new_start = end; - for (size_t i = start; i < new_start; i++) { - apply(input, i, 0, 0, false); - for (auto& a : alts) { - if (a == 0 || a == input) continue; - apply(a, i, 0, 0, true); - } - } - start = new_start; - epsilonClosure(); + StepLoop({ + apply(input, i, 0, 0, false); + for (auto& a : alts) { + if (a == 0 || a == input) continue; + apply(a, i, 0, 0, true); + } + }) } void ReusableState::step_case(UChar32 val, UChar32 val2, bool caseSensitive) @@ -252,14 +230,11 @@ void ReusableState::extract(size_t pos, UString& result, double& weight, const Alphabet& alphabet, const std::set& escaped_chars, bool uppercase) const { - size_t i = pos; std::vector symbols; - while (i != 0) { - auto& it = get(i); - weight += it.weight; - if (it.symbol) symbols.push_back(it.symbol); - i = it.prev; - } + WalkBack(it, pos, { + weight += it.weight; + if (it.symbol) symbols.push_back(it.symbol); + }) for (auto it = symbols.rbegin(); it != symbols.rend(); it++) { if (escaped_chars.find(*it) != escaped_chars.end()) result += '\\'; alphabet.getSymbol(result, *it, uppercase); @@ -333,3 +308,90 @@ UString ReusableState::filterFinals(const std::map& finals, } return temp; } + +bool ReusableState::lastPartHasRequiredSymbol(size_t pos, int32_t symbol, + int32_t separator) +{ + WalkBack(it, pos, { + if (it.symbol == symbol) return true; + else if (separator && it.symbol == separator) return false; + }); + return false; +} + +bool ReusableState::hasSymbol(int32_t symbol) +{ + for (size_t i = start; i < end; i++) { + if (lastPartHasRequiredSymbol(i, symbol, 0)) return true; + } + return false; +} + +void ReusableState::pruneCompounds(int32_t requiredSymbol, int32_t separator, + int maxElements) +{ + int min = maxElements; + size_t len = size(); + std::vector count(len, 0); + for (size_t i = 0; i < len; i++) { + bool found = false; + WalkBack(it, i+start, { + if (it.symbol == requiredSymbol && count[i] == 0) found = true; + else if (it.symbol == separator) { + if (found) count[i]++; + else { + count[i] = INT_MAX; + break; + } + } + }); + if (count[i] < min) min = count[i]; + } + size_t keep = 0; + for (size_t i = 0; i < len; i++) { + if (count[i] == min) { + size_t src = start + i; + size_t dest = start + keep; + // move the step that we're keeping, overwriting one that's being + // discarded, and shrink the state size + if (src != dest) get_or_create(dest) = get(src); + keep++; + } + } + end = start + keep; +} + +void ReusableState::restartFinals(const std::map& finals, + int32_t requiredSymbol, Node* restart, + int32_t separator) +{ + if (restart == nullptr) return; + for (size_t i = start, limit = end; i < limit; i++) { + auto& step = get(i); + if (finals.count(step.where) > 0 && + lastPartHasRequiredSymbol(i, requiredSymbol, separator)) { + size_t start_was = start; + start = end; + end++; + auto& newstep = get_or_create(start); + newstep.where = restart; + newstep.symbol = separator; + newstep.prev = i; + epsilonClosure(); + start = start_was; + } + } +} + +void ReusableState::pruneStatesWithForbiddenSymbol(int32_t symbol) +{ + size_t keep = 0; + for (size_t i = start; i < end; i++) { + if (!lastPartHasRequiredSymbol(i, symbol, 0)) { + size_t dest = start + keep; + if (i != dest) get_or_create(dest) = get(i); + keep++; + } + } + end = start + keep; +} diff --git a/lttoolbox/reusable_state.h b/lttoolbox/reusable_state.h index 14901c8..94a39fe 100644 --- a/lttoolbox/reusable_state.h +++ b/lttoolbox/reusable_state.h @@ -27,9 +27,7 @@ class ReusableState { size_t start = 0; size_t end = 1; - void destroy(); - - Step& create(size_t index); + Step& get_or_create(size_t index); const Step& get(size_t index) const; bool apply(int32_t input, size_t pos, int32_t old_sym, int32_t new_sym, @@ -44,13 +42,10 @@ class ReusableState { public: ReusableState(); ~ReusableState(); - ReusableState(const ReusableState& s); - ReusableState& operator =(const ReusableState& s); - - void copy(const ReusableState& s); size_t size() const; void init(Node* initial); + void reinit(Node* initial); void step(int32_t input); void step(int32_t input, int32_t alt); @@ -74,6 +69,15 @@ class ReusableState { int max_analyses, int max_weight_classes, bool uppercase, bool firstupper, int firstchar = 0) const; + + bool lastPartHasRequiredSymbol(size_t pos, int32_t symbol, int32_t separator); + bool hasSymbol(int32_t symbol); + void pruneCompounds(int32_t requiredSymbol, int32_t separator, + int maxElements); + void restartFinals(const std::map& finals, + int32_t requiredSymbol, Node* restart_state, + int32_t separator); + void pruneStatesWithForbiddenSymbol(int32_t symbol); }; #endif