diff --git a/src/cscout.cpp b/src/cscout.cpp index 17ccc1a..b5c6666 100644 --- a/src/cscout.cpp +++ b/src/cscout.cpp @@ -3346,16 +3346,21 @@ merge_tokens(char **argv) /* * Example invocation: - * cscout -M eclasses-5.txt ids-5.txt functionids-5.txt \ - * 0 1 2 - * new-eclasses-5.csv new-ids-5.csv new-functioids-5.csv - * 3 4 5 + * cscout -M \ + * eclasses-a-5.txt \ # 0 + * eclasses-a-5.txt \ # 1 + * ids-5.txt \ # 2 + * functionids-5.txt \ # 3 + * new-eclasses-5.csv \ # 4 + * new-ids-5.csv \ # 5 + * new-functionds-5.csv # 6 */ - Dbtoken::read_eclasses(argv[0]); - Dbtoken::write_eclasses(argv[3]); - Dbtoken::read_ids(argv[1]); - Dbtoken::write_ids(argv[1], argv[4]); - Dbtoken::read_write_functionids(argv[2], argv[5]); + Dbtoken::add_eclasses_attached(argv[0]); + Dbtoken::process_eclasses_original(argv[1]); + Dbtoken::write_eclasses(argv[4]); + Dbtoken::read_ids(argv[2]); + Dbtoken::write_ids(argv[2], argv[5]); + Dbtoken::read_write_functionids(argv[3], argv[6]); exit(0); } diff --git a/src/dbtoken.cpp b/src/dbtoken.cpp index 7607771..3182655 100644 --- a/src/dbtoken.cpp +++ b/src/dbtoken.cpp @@ -68,9 +68,10 @@ unify_and_clear(Dbtoken &found, Dbtoken &read) static int line_number; static void -warn(const char *file_name, Tokid ti) +warn(const char *file_name, const string &context, Tokid ti) { - cerr << file_name << '(' << line_number << "): missing EC for file " + cerr << file_name << '(' << line_number << "): " << context + << ": missing EC for file " << ti.get_fileid().get_id() << " offset " << (unsigned)ti.get_streampos() << "\n"; } @@ -99,9 +100,28 @@ twin(const Tokid &ti) return Tokid(Fileid(-ti.get_fileid().get_id()), ti.get_streampos()); } -// Read tokids and their equivalence classes +// Return the Eclass for the specified tokid or its twin. +// If it isn't found return NULL. +static Eclass * +check_ec(const Tokid &ti) +{ + Eclass *ec = ti.check_ec(); + + if (ec != NULL) + return ec; + + // Try the twin + return twin(ti).check_ec(); +} + +/* + * First, read tokids and their equivalence classes from the attached + * (often larger) file. + * As ECs come from a single file, all are guaranteed to be unique + * and non-overlapping. Therefore, Create ECs or add entries to existing ECs. + */ void -Dbtoken::read_eclasses(const char *in_path) +Dbtoken::add_eclasses_attached(const char *in_path) { Fileid::disable_filedetails(); Project::set_current_project("dbmerge"); @@ -109,32 +129,8 @@ Dbtoken::read_eclasses(const char *in_path) ifstream input(in_path); verify_open(in_path, input); - Eclass *ec = NULL, *prev_ec = NULL; // EC pointer as created / found + Eclass *ec = NULL; // EC pointer as created / found long ecid, prev_ecid = 0; // EC identifier read from file - enum { - /* - * Reading from attached database (id != 0). This comes first - * and its ECs are all fresh. - */ - s_read_first, - /* - * Reading from the original database (id == 0). - * This comes second in the file, may be smaller, - * and may have some of its ECs repeated in the attached - * database, which comes first. - */ - s_read_second, // Clean slate to process the next entry - s_read_ov, // Read entry overflowed a matching EC - s_found_ov, // A matched EC overflowed the read entry - } state = s_read_first; - - /* - * Invariant: found_minus_read always contains difference in the - * length of characters stored in the above "found" and "read" - * variables. - */ - Dbtoken found, read; - int found_minus_read = 0; string line_record; line_number = 0; @@ -142,175 +138,190 @@ Dbtoken::read_eclasses(const char *in_path) line_number++; istringstream line_stream(line_record); - int dbid; int fid; unsigned long offset; int len; - if (!(line_stream >> dbid >> fid >> offset >> len >> ecid)) { + if (!(line_stream >> fid >> offset >> len >> ecid)) { warn(in_path, line_record); continue; } - // fids from second read part are stored -ve to avoid clashes - if (state != s_read_first) - fid = -fid; - Tokid ti(Fileid(fid), offset); -state_process: if (DP()) - cout << "State " << state << " line_number " << line_number << " ti " << ti << '\n'; - switch (state) { - case s_read_first: - if (dbid == 0) { - // Entered second batch - state = s_read_second; - prev_ecid = 0; - prev_ec = NULL; - // Represent second part tokids with -ve fid - ti = twin(ti); - goto state_process; - } + cout << in_path << '(' << line_number << ") ti " << ti << '\n'; - if (ecid == prev_ecid) { - // Same EC as previous entry - ec->add_tokid(ti); - continue; - } + if (ecid == prev_ecid) { + // Same EC as previous entry + ec->add_tokid(ti); + continue; + } + + ec = new Eclass(ti, len); + prev_ecid = ecid; + } +} + +/* + * Second, read tokids and their equivalence classes from the original + * (sometimes empty) file. + * Create ECs or add entries to existing ECs using as an identifier the + * twin tokid to keep them distinct from the attached ones. + */ +static void +add_eclasses_original(const char *in_path) +{ - ec = new Eclass(ti, len); + ifstream input(in_path); + verify_open(in_path, input); + + Eclass *ec = NULL; // EC pointer as created / found + long ecid, prev_ecid = 0; // EC identifier read from file + + string line_record; + line_number = 0; + while (getline(input, line_record)) { + line_number++; + + istringstream line_stream(line_record); + int fid; + unsigned long offset; + int len; + if (!(line_stream >> fid >> offset >> len >> ecid)) { + warn(in_path, line_record); + continue; + } + + Tokid ti(Fileid(fid), offset); + + if (DP()) + cout << in_path << '(' << line_number << ") ti " << ti << '\n'; + + if (ecid == prev_ecid) { + ec->add_tokid(twin(ti)); if (DP()) - cout << "Create 1st " << ec << " tokid " << ti << "\n"; - break; - case s_read_second: - ec = twin(ti).check_ec(); - if (ec == NULL) { - if (DP()) - cout << "No EC for " << ti << '\n'; - if (ecid == prev_ecid) { - // Unknown EC is same class: - // Add it to existing class - ec = prev_ec; - ec->add_tokid(ti); - if (DP()) - cout << "Add to " << ec << " tokid " << ti << "\n"; - } else { - // Add clean new EC at start of an id - ec = new Eclass(ti, len); - if (DP()) - cout << "Create 2nd " << ec << " tokid " << ti << "\n"; - } - break; - } + cout << "Add original ti " << twin(ti) << '\n'; + continue; + } - // From here on existing EC != NULL + // Add ECs for not found ones + ec = ti.check_ec(); + if (ec == nullptr || ec->get_len() != len) { + // fids from original are stored -ve to avoid clashes + ec = new Eclass(twin(ti), len); if (DP()) - cout << "In read_second EC_len=" << ec->get_len() << " read_len=" << len << '\n'; - if (ec->get_len() == len) { - // The 2nd-read EC joins two 1st-read ECs - if (ecid == prev_ecid && ec != prev_ec) - merge_into(ec, prev_ec); - // In remaining 3 Boolean cases, do nothing - break; - } + cout << "Create original ti " << twin(ti) << '\n'; + } + prev_ecid = ecid; + } +} + + +/* + * Third, go through the original eclasses looking for twins + * (of different length). + * Put together tokens of the same length, and unify them. + * + * Got through the file rather than the EC map, because the map changes + * as tokens get unified. + */ +static void +merge_eclasses_original(const char *in_path) +{ + + ifstream input(in_path); + verify_open(in_path, input); + + string line_record; + line_number = 0; + while (getline(input, line_record)) { + line_number++; + + istringstream line_stream(line_record); + long ecid; // EC identifier read from file + int fid; + unsigned long offset; + int len; + if (!(line_stream >> fid >> offset >> len >> ecid)) { + warn(in_path, line_record); + continue; + } + + // Initialize a possibly attached ti, and the original, + // which is -ve. + Tokid attached_ti(Fileid(fid), offset); + Tokid original_ti(twin(attached_ti)); + + if (DP()) + cout << in_path << '(' << line_number << ") ti " + << attached_ti << '\n'; + + Eclass *ec_attached = attached_ti.check_ec(); + if (!ec_attached) + continue; + + Eclass *ec_original = original_ti.check_ec(); + // It may bot have been added, because it was in attached + if (!ec_original) + continue; - // From here on tokid overlaps complicate things. + // Assemble tokids into two equal-length tokens to unify + Dbtoken original; // -ve fids; same as ti + original.add_part(original_ti, ec_original->get_len()); + int original_len = ec_original->get_len(); - // 1.Store parts in tokens to unify. - read.add_part(ti, len); - found_minus_read = -len; + Dbtoken attached; // +ve fids + attached.add_part(attached_ti, ec_attached->get_len()); + int attached_len = ec_attached->get_len(); - // 2. Adjust state based on overlap direction - if (len > ec->get_len()) { - state = s_read_ov; - ti = twin(ti); - goto state_process; + while (original_len != attached_len) { + if (DP()) { + cout << "Tokid: " << attached_ti << '\n'; + cout << "Original: " << original << '\n'; + cout << "Attached: " << attached << '\n'; } - found.add_part(twin(ti), ec->get_len()); - found_minus_read += ec->get_len(); - state = s_found_ov; - break; - case s_read_ov: - /* - * The read data overflowed the found EC; - * continue checking ECs until they cover it. - * Preconditions: - * - ti points at the beginning of the found area, - * with a positive fid. - * - The "read" parts contains the read parts up - * to and including the one that overlapped. - * - The "found" parts contains alla apart from - * the one that will overlap. - */ - if (DP()) - cout << "In read_ov found_minus_read=" << found_minus_read << " ti=" << ti << '\n'; - for (;;) { - ec = ti.check_ec(); - if (ec == NULL) { - warn(in_path, ti); - found_minus_read = 0; - break; + if (original_len > attached_len) { + // Find and add an attached EC + attached_ti += ec_attached->get_len(); + ec_attached = attached_ti.check_ec(); + if (!ec_attached) { + warn(in_path, "Obtain next attached EC", attached_ti); + original_len = attached_len; } - if (DP()) - cout << "In read_ov found=" << found << '\n'; - found.add_part(ti, ec->get_len()); - found_minus_read += ec->get_len(); - if (found_minus_read >= 0) - break; - ti += ec->get_len(); - if (DP()) - cout << "In read_ov set ti " << ti << '\n'; - } - - if (found_minus_read == 0) { - unify_and_clear(found, read); - state = s_read_second; - } else - state = s_found_ov; - break; - case s_found_ov: - /* - * The found EC overflowed the read data; - * continue reading parts until the found EC is - * covered. - * Preconditions: - * - ti points at the element read after the overflow, - * with a negative fid. - * - len contains the length of the newly read item. - * - The found and read lists contain the parts that - * partially overlapped. - */ - if (DP()) - cout << "In found_ov found_minus_read=" << found_minus_read << '\n'; - read.add_part(ti, len); - int adjust = found_minus_read; - found_minus_read -= len; - if (DP()) - cout << "In found_ov set found_minus_read " << found_minus_read << '\n'; - if (found_minus_read < 0) { - ti += adjust; - ti = twin(ti); - ec = ti.check_ec(); - if (ec == NULL) { - warn(in_path, ti); - found_minus_read = 0; - break; + attached.add_part(attached_ti, ec_attached->get_len()); + attached_len += ec_attached->get_len(); + } else { + // Find and add an original EC + original_ti += ec_original->get_len(); + ec_original = original_ti.check_ec(); + if (!ec_original) { + warn(in_path, "Obtain next original EC", original_ti); + attached_len = original_len; } - state = s_read_ov; - goto state_process; + original.add_part(original_ti, ec_original->get_len()); + original_len += ec_original->get_len(); } - if (found_minus_read == 0) { - unify_and_clear(found, read); - state = s_read_second; - } - // Otherwise found_minus_read in found_ov - break; } - prev_ecid = ecid; - prev_ec = ec; + + unify_and_clear(original, attached); } } +/* + * Read and merge tokids and ECs from the original database + * (sometimes empty) file. + */ +void +Dbtoken::process_eclasses_original(const char *in_path) +{ + Fileid::disable_filedetails(); + Project::set_current_project("dbmerge"); + add_eclasses_original(in_path); + if (DP()) + cout << "Start merge pass\n"; + merge_eclasses_original(in_path); +} + // Output tokids and their equivalence classes to file named f void Dbtoken::write_eclasses(const char *out_path) @@ -330,7 +341,9 @@ Dbtoken::write_eclasses(const char *out_path) // Mirror EC exists; ignore if (ec_twin->get_len() != ec->get_len()) { if (DP()) - cout << "Non-matching ECs:\nFound: " << *ec << "Read: " << *ec_twin; + cout << "Non-matching ECs:" + << "\nFound: " << *ec << " size: " << ec->get_size() + << "\nRead: " << *ec_twin << " size: " << ec_twin->get_size() << '\n'; csassert(ec_twin->get_len() == ec->get_len()); } continue; @@ -346,20 +359,6 @@ Dbtoken::write_eclasses(const char *out_path) } } -// Return the Eclass for the specified tokid or its twin. -// If it isn't found return NULL. -static Eclass * -check_ec(const Tokid &ti) -{ - Eclass *ec = ti.check_ec(); - - if (ec != NULL) - return ec; - - // Try the twin - return twin(ti).check_ec(); -} - // Read identifiers from in_path and set the EC attributes void Dbtoken::read_ids(const char *in_path) @@ -414,7 +413,7 @@ Dbtoken::read_ids(const char *in_path) Eclass *ec = check_ec(ti); if (ec == NULL) { - warn(in_path, ti); + warn(in_path, "Obtain id EC", ti); goto next_line; } @@ -512,7 +511,7 @@ Dbtoken::write_ids(const char *in_path, const char *out_path) Eclass *ec = check_ec(ti); if (ec == NULL) { - warn(in_path, ti); + warn(in_path, "Obtain next id EC", ti); goto next_line; } @@ -546,7 +545,7 @@ Dbtoken::read_write_functionids(const char *in_path, const char *out_path) while (getline(in, line_record)) { line_number++; if (DP()) - cout << "Read: " << line_record << '\n'; + cout << in_path << '(' << line_number << "): " << line_record << '\n'; istringstream line_stream(line_record); int functionid, in_ordinal; @@ -577,7 +576,7 @@ Dbtoken::read_write_functionids(const char *in_path, const char *out_path) Eclass *ec = check_ec(ti); if (ec == NULL) { - warn(in_path, ti); + warn(in_path, "Obtain function EC", ti); goto next_line; } diff --git a/src/dbtoken.h b/src/dbtoken.h index f8daddf..076714e 100644 --- a/src/dbtoken.h +++ b/src/dbtoken.h @@ -58,7 +58,8 @@ class Dbtoken : public Token { // Read/write tokids and their eids from file named f - static void read_eclasses(const char *f); + static void add_eclasses_attached(const char *f); + static void process_eclasses_original(const char *f); // Output tokids and their equivalence classes to file named f static void write_eclasses(const char *f);