Skip to content

Commit

Permalink
compounding methods
Browse files Browse the repository at this point in the history
+ drop the copy constructor and copy-assignment operator, since using them
(in all existing cases) would negate the point of this rewrite
  • Loading branch information
mr-martian committed Jul 26, 2024
1 parent 04223be commit dd5899d
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 101 deletions.
20 changes: 18 additions & 2 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,20 @@ FSTProcessor::filterFinals(const State& state, UStringView casefrom)
uppercase, firstupper, 0);
}

/**
 * Overload of filterFinals for a ReusableState.
 *
 * Derives the two case flags from the surface form and forwards them,
 * together with the processor's all_finals, alphabet, escaped_chars,
 * displayWeightsMode, maxAnalyses and maxWeightClasses, to
 * ReusableState::filterFinals.
 *
 * @param state    the state whose final analyses are collected
 * @param casefrom surface form the casing is taken from; an empty view is
 *                 treated as "no upper case" instead of being indexed
 * @return whatever state.filterFinals() returns
 */
UString
FSTProcessor::filterFinals(const ReusableState& state, UStringView casefrom)
{
  bool firstupper = false;
  bool uppercase = false;
  // Casing is only taken from the surface form when dictionaryCase is off.
  // The emptiness check keeps casefrom[0] from reading out of bounds.
  if (!dictionaryCase && !casefrom.empty()) {
    firstupper = u_isupper(casefrom[0]);
    // "uppercase" requires at least two characters with both the first and
    // the last one upper case.
    uppercase = (casefrom.size() > 1 &&
                 firstupper && u_isupper(casefrom[casefrom.size()-1]));
  }
  return state.filterFinals(all_finals, alphabet, escaped_chars,
                            displayWeightsMode, maxAnalyses, maxWeightClasses,
                            uppercase, firstupper, 0);
}

void
FSTProcessor::writeEscaped(UStringView str, UFILE *output)
{
Expand Down Expand Up @@ -983,7 +997,9 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
bool last_incond = false;
bool last_postblank = false;
bool last_preblank = false;
State current_state = initial_state;
//State current_state = initial_state;
ReusableState current_state;
current_state.init(&root);
UString lf; // analysis (lexical form and tags)
UString sf; // surface form
UString lf_spcmp; // space compound analysis
Expand Down Expand Up @@ -1238,7 +1254,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
}
}

current_state = initial_state;
current_state.init(&root);
lf.clear();
sf.clear();
last_start = input_buffer.getPos();
Expand Down
1 change: 1 addition & 0 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ class FSTProcessor
* Assumes that casefrom is non-empty
*/
UString filterFinals(const State& state, UStringView casefrom);
UString filterFinals(const ReusableState& state, UStringView casefrom);

/**
* Write a string to an output stream,
Expand Down
246 changes: 154 additions & 92 deletions lttoolbox/reusable_state.cc
Original file line number Diff line number Diff line change
@@ -1,46 +1,35 @@
#include <reusable_state.h>
#include <climits>

ReusableState::ReusableState() {}

ReusableState::~ReusableState()
{
destroy();
}
// WalkBack(var, pos, block): run `block` for the step at index `pos` and then
// for each step reached by following `prev`, with `var` bound (by reference)
// to the current step. The walk ends when the index becomes 0, so the step at
// index 0 itself is never visited.
// Notes for callers:
//  - `break` inside `block` leaves the walk; `return` leaves the enclosing
//    function.
//  - the macro declares a local named `index`; `block` must not redeclare it.
//  - commas in `block` that are not inside parentheses would be read as
//    macro-argument separators.
#define WalkBack(var, pos, block) { \
size_t index = pos; \
while (index != 0) { \
auto& var = get(index); \
block \
index = var.prev; \
} \
}

ReusableState::ReusableState(const ReusableState& s)
{
copy(s);
}
// StepLoop(block): run `block` once for every index `i` from `start` up to
// (but excluding) the value `end` had on entry, then move `start` to that old
// end and call epsilonClosure(). Because the upper bound is captured before
// the loop, changes `block` makes to `end` do not extend the iteration.
// `i` and `new_start` are declared by the macro and are visible in `block`.
#define StepLoop(block) { \
size_t new_start = end; \
for (size_t i = start; i < new_start; i++) { \
block \
} \
start = new_start; \
epsilonClosure(); \
}

ReusableState& ReusableState::operator=(const ReusableState& s)
{
if (this != &s) copy(s);
return *this;
}
// Default constructor: does nothing in its body. FSTProcessor::analysis()
// calls init() right after constructing an instance.
ReusableState::ReusableState() {}

void ReusableState::destroy()
// Release every block held in `steps`, then empty the list itself.
ReusableState::~ReusableState()
{
  for (auto* block : steps) {
    delete block;
  }
  steps.clear();
}

void ReusableState::copy(const ReusableState& s)
{
destroy();
size_t N = (s.end >> STATE_STEP_BLOCK_SIZE_EXP) + 1;
steps.reserve(N);
for (size_t i = 0; i < N; i++) {
auto block = new std::array<Step,STATE_STEP_BLOCK_SIZE>;
*block = *(s.steps[i]);
steps.push_back(block);
}
start = s.start;
end = s.end;
}

ReusableState::Step& ReusableState::create(size_t index)
ReusableState::Step& ReusableState::get_or_create(size_t index)
{
size_t a = index >> STATE_STEP_BLOCK_SIZE_EXP;
size_t b = index & (STATE_STEP_BLOCK_SIZE-1);
Expand All @@ -67,7 +56,7 @@ bool ReusableState::apply(int32_t input, size_t pos,
it = prev.where->transitions.find(input);
if (it != prev.where->transitions.end()) {
for (int j = 0; j < it->second.size; j++) {
Step& next = create(end);
Step& next = get_or_create(end);
next.where = it->second.dest[j];
next.symbol = it->second.out_tag[j];
if (old_sym && next.symbol == old_sym) next.symbol = new_sym;
Expand Down Expand Up @@ -101,18 +90,25 @@ void ReusableState::init(Node* initial)
}
start = 0;
end = 1;
create(0).where = initial;
get_or_create(0).where = initial;
epsilonClosure();
}

void ReusableState::step(int32_t input)
void ReusableState::reinit(Node* initial)
{
size_t new_start = end;
for (size_t i = start; i < new_start; i++) {
apply(input, i, 0, 0, false);
}
start = new_start;
size_t start_was = start;
get_or_create(end).where = initial;
start = end;
end++;
epsilonClosure();
start = start_was;
}

// Advance over a single input symbol: apply() is called with `input` for each
// index in the current [start, end) range, after which StepLoop moves `start`
// to the old `end` and runs epsilonClosure().
void ReusableState::step(int32_t input)
{
StepLoop({
apply(input, i, 0, 0, false);
})
}

void ReusableState::step(int32_t input, int32_t alt)
Expand All @@ -121,24 +117,18 @@ void ReusableState::step(int32_t input, int32_t alt)
step(input);
return;
}
size_t new_start = end;
for (size_t i = start; i < new_start; i++) {
apply(input, i, 0, 0, false);
apply(alt, i, 0, 0, true);
}
start = new_start;
epsilonClosure();
StepLoop({
apply(input, i, 0, 0, false);
apply(alt, i, 0, 0, true);
})
}

void ReusableState::step_override(int32_t input,
int32_t old_sym, int32_t new_sym)
{
size_t new_start = end;
for (size_t i = start; i < new_start; i++) {
apply(input, i, old_sym, new_sym, false);
}
start = new_start;
epsilonClosure();
StepLoop({
apply(input, i, old_sym, new_sym, false);
})
}

void ReusableState::step_override(int32_t input, int32_t alt,
Expand All @@ -148,13 +138,10 @@ void ReusableState::step_override(int32_t input, int32_t alt,
step_override(input, old_sym, new_sym);
return;
}
size_t new_start = end;
for (size_t i = start; i < new_start; i++) {
apply(input, i, old_sym, new_sym, false);
apply(alt, i, old_sym, new_sym, true);
}
start = new_start;
epsilonClosure();
StepLoop({
apply(input, i, old_sym, new_sym, false);
apply(alt, i, old_sym, new_sym, true);
})
}

void ReusableState::step_careful(int32_t input, int32_t alt)
Expand All @@ -163,14 +150,11 @@ void ReusableState::step_careful(int32_t input, int32_t alt)
step(input);
return;
}
size_t new_start = end;
for (size_t i = start; i < new_start; i++) {
if (!apply(input, i, 0, 0, false)) {
apply(alt, i, 0, 0, true);
}
}
start = new_start;
epsilonClosure();
StepLoop({
if (!apply(input, i, 0, 0, false)) {
apply(alt, i, 0, 0, true);
}
})
}

void ReusableState::step(int32_t input, int32_t alt1, int32_t alt2)
Expand All @@ -182,28 +166,22 @@ void ReusableState::step(int32_t input, int32_t alt1, int32_t alt2)
step(input, alt1);
return;
}
size_t new_start = end;
for (size_t i = start; i < new_start; i++) {
apply(input, i, 0, 0, false);
apply(alt1, i, 0, 0, true);
apply(alt2, i, 0, 0, true);
}
start = new_start;
epsilonClosure();
StepLoop({
apply(input, i, 0, 0, false);
apply(alt1, i, 0, 0, true);
apply(alt2, i, 0, 0, true);
})
}

void ReusableState::step(int32_t input, std::set<int> alts)
{
size_t new_start = end;
for (size_t i = start; i < new_start; i++) {
apply(input, i, 0, 0, false);
for (auto& a : alts) {
if (a == 0 || a == input) continue;
apply(a, i, 0, 0, true);
}
}
start = new_start;
epsilonClosure();
StepLoop({
apply(input, i, 0, 0, false);
for (auto& a : alts) {
if (a == 0 || a == input) continue;
apply(a, i, 0, 0, true);
}
})
}

void ReusableState::step_case(UChar32 val, UChar32 val2, bool caseSensitive)
Expand Down Expand Up @@ -252,14 +230,11 @@ void ReusableState::extract(size_t pos, UString& result, double& weight,
const Alphabet& alphabet,
const std::set<UChar32>& escaped_chars,
bool uppercase) const {
size_t i = pos;
std::vector<int32_t> symbols;
while (i != 0) {
auto& it = get(i);
weight += it.weight;
if (it.symbol) symbols.push_back(it.symbol);
i = it.prev;
}
WalkBack(it, pos, {
weight += it.weight;
if (it.symbol) symbols.push_back(it.symbol);
})
for (auto it = symbols.rbegin(); it != symbols.rend(); it++) {
if (escaped_chars.find(*it) != escaped_chars.end()) result += '\\';
alphabet.getSymbol(result, *it, uppercase);
Expand Down Expand Up @@ -333,3 +308,90 @@ UString ReusableState::filterFinals(const std::map<Node*, double>& finals,
}
return temp;
}

bool ReusableState::lastPartHasRequiredSymbol(size_t pos, int32_t symbol,
int32_t separator)
{
WalkBack(it, pos, {
if (it.symbol == symbol) return true;
else if (separator && it.symbol == separator) return false;
});
return false;
}

/**
 * Report whether any step in the current [start, end) range has `symbol`
 * somewhere on its path (searched without a separator, i.e. over the whole
 * path back to index 0).
 */
bool ReusableState::hasSymbol(int32_t symbol)
{
  size_t pos = start;
  while (pos < end) {
    if (lastPartHasRequiredSymbol(pos, symbol, 0)) {
      return true;
    }
    ++pos;
  }
  return false;
}

void ReusableState::pruneCompounds(int32_t requiredSymbol, int32_t separator,
int maxElements)
{
int min = maxElements;
size_t len = size();
std::vector<int> count(len, 0);
for (size_t i = 0; i < len; i++) {
bool found = false;
WalkBack(it, i+start, {
if (it.symbol == requiredSymbol && count[i] == 0) found = true;
else if (it.symbol == separator) {
if (found) count[i]++;
else {
count[i] = INT_MAX;
break;
}
}
});
if (count[i] < min) min = count[i];
}
size_t keep = 0;
for (size_t i = 0; i < len; i++) {
if (count[i] == min) {
size_t src = start + i;
size_t dest = start + keep;
// move the step that we're keeping, overwriting one that's being
// discarded, and shrink the state size
if (src != dest) get_or_create(dest) = get(src);
keep++;
}
}
end = start + keep;
}

/**
 * For every active step that sits on a final node and whose last part
 * contains `requiredSymbol`, append a new step that continues from `restart`.
 *
 * The appended step records `separator` as its symbol and points back to the
 * final step through `prev`. epsilonClosure() is then run with `start`
 * temporarily moved to the new step, and `start` is restored afterwards.
 *
 * @param finals         nodes regarded as final (only the keys are consulted)
 * @param requiredSymbol symbol that must occur in the last part of the path
 * @param restart        node to continue from; nothing happens if null
 * @param separator      symbol stored on the appended step
 */
void ReusableState::restartFinals(const std::map<Node*, double>& finals,
                                  int32_t requiredSymbol, Node* restart,
                                  int32_t separator)
{
  if (restart == nullptr) return;
  // `end` is advanced inside the loop, so only the steps that were active on
  // entry are examined.
  for (size_t i = start, limit = end; i < limit; i++) {
    if (finals.count(get(i).where) == 0 ||
        !lastPartHasRequiredSymbol(i, requiredSymbol, separator)) {
      continue;
    }
    size_t start_was = start;
    start = end;
    end++;
    auto& newstep = get_or_create(start);
    newstep.where = restart;
    newstep.symbol = separator;
    // Set explicitly so the separator step never contributes a weight left
    // over from an earlier use of this slot (get_or_create is not assumed to
    // reset it).
    newstep.weight = 0;
    newstep.prev = i;
    epsilonClosure();
    start = start_was;
  }
}

/**
 * Drop every active step whose path contains `symbol` (searched over the
 * whole path, no separator). Surviving steps are shifted down so they stay
 * contiguous from `start`, and `end` is moved to just past the last survivor.
 */
void ReusableState::pruneStatesWithForbiddenSymbol(int32_t symbol)
{
  size_t dest = start;
  for (size_t src = start; src < end; src++) {
    if (lastPartHasRequiredSymbol(src, symbol, 0)) continue;
    // Only copy when the survivor actually has to move.
    if (src != dest) get_or_create(dest) = get(src);
    dest++;
  }
  end = dest;
}
Loading

0 comments on commit dd5899d

Please sign in to comment.