Bump bundle version to 1.10.0.

  * Bundle.ecl: Version 1.9.5 -> 1.10.0.
  * Profile.ecl: record-count style fields (rec_count, fill_count,
    filled_count, cardinality, value_count) switch from UNSIGNED4 to a
    shared %RecCount_t% alias declared as UNSIGNED6; UTF8 attributes get
    their own regex-based pattern-mapping helpers and TRIM wrapper, and
    the data_pattern dispatch handles 'unicode' and 'utf' separately.
    The UTF8 branch casts to UTF8 (not UNICODE) so the value reaches the
    UTF8 helpers without an intermediate conversion.
  * README.md: changelog row for 1.10.0.
  * report/: rollup 2.41.4 -> 3.29.5, fsevents 2.3.2 -> 2.3.3.

NOTE(review): @rollup/plugin-node-resolve stays at ^11.2.0 -- confirm its
peer-dependency range accepts rollup 3 before merging.
NOTE(review): leading whitespace inside the hunks is reconstructed; if a
context line fails to match, apply with `git apply --ignore-whitespace`.

diff --git a/Bundle.ecl b/Bundle.ecl
index 33c6125..cb710cb 100644
--- a/Bundle.ecl
+++ b/Bundle.ecl
@@ -6,5 +6,5 @@ EXPORT Bundle := MODULE(Std.BundleBase)
     EXPORT License := 'http://www.apache.org/licenses/LICENSE-2.0';
     EXPORT Copyright := 'Copyright (C) 2024 HPCC Systems';
     EXPORT DependsOn := [];
-    EXPORT Version := '1.9.5';
+    EXPORT Version := '1.10.0';
 END;
diff --git a/Profile.ecl b/Profile.ecl
index 877d1e0..3f44a96 100644
--- a/Profile.ecl
+++ b/Profile.ecl
@@ -234,6 +234,8 @@ EXPORT Profile(inFile,
     LOCAL %AttributeType_t% := STRING36;
     #UNIQUENAME(NumericStat_t);
     LOCAL %NumericStat_t% := DECIMAL32_4;
+    #UNIQUENAME(RecCount_t);
+    LOCAL %RecCount_t% := UNSIGNED6;
 
     // Tests for enabled features
     #UNIQUENAME(FeatureEnabledFillRate);
@@ -391,12 +393,12 @@ EXPORT Profile(inFile,
     // Define the record layout that will be used by the inner _Inner_Profile() call
     LOCAL ModeRec := RECORD
         UTF8                            value;
-        UNSIGNED4                       rec_count;
+        %RecCount_t%                    rec_count;
     END;
 
     LOCAL PatternCountRec := RECORD
         STRING                          data_pattern;
-        UNSIGNED4                       rec_count;
+        %RecCount_t%                    rec_count;
         UTF8                            example;
     END;
 
@@ -408,11 +410,11 @@ EXPORT Profile(inFile,
     LOCAL OutputLayout := RECORD
         STRING                          sortValue;
         STRING                          attribute;
-        UNSIGNED4                       rec_count;
+        %RecCount_t%                    rec_count;
         STRING                          given_attribute_type;
         DECIMAL9_6                      fill_rate;
-        UNSIGNED4                       fill_count;
-        UNSIGNED4                       cardinality;
+        %RecCount_t%                    fill_count;
+        %RecCount_t%                    cardinality;
         DATASET(ModeRec)                cardinality_breakdown {MAXCOUNT(%lowCardinalityThreshold%)};
         STRING                          best_attribute_type;
         DATASET(ModeRec)                modes {MAXCOUNT(%MAX_MODES%)};
@@ -442,13 +444,13 @@ EXPORT Profile(inFile,
         #IF(%FeatureEnabledBestECLTypes%())
             STRING                      best_attribute_type;
         #END
-        UNSIGNED4                       rec_count;
+        %RecCount_t%                    rec_count;
         #IF(%FeatureEnabledFillRate%())
-            UNSIGNED4                   fill_count;
+            %RecCount_t%                fill_count;
             DECIMAL9_6                  fill_rate;
         #END
         #IF(%FeatureEnabledCardinality%())
-            UNSIGNED4                   cardinality;
+            %RecCount_t%                cardinality;
         #END
         #IF(%FeatureEnabledLowCardinalityBreakdown%())
             DATASET(ModeRec)            cardinality_breakdown;
@@ -623,11 +625,24 @@ EXPORT Profile(inFile,
         #UNIQUENAME(_MapAllUni);
         LOCAL %_MapAllUni%(UNICODE s) := (STRING)%_MapDigitUni%(%_MapLowerCharUni%(%_MapUpperCharUni%(s)));
 
+        // Pattern mapping a UTF8 datatype; using regex due to the complexity
+        // of the character set
+        #UNIQUENAME(_MapUpperCharUTF8);
+        LOCAL %_MapUpperCharUTF8%(UTF8 s) := REGEXREPLACE(u8'\\p{Lu}', s, u8'A');
+        #UNIQUENAME(_MapLowerCharUTF8);
+        LOCAL %_MapLowerCharUTF8%(UTF8 s) := REGEXREPLACE(u8'[\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}]', s, u8'a');
+        #UNIQUENAME(_MapDigitUTF8);
+        LOCAL %_MapDigitUTF8%(UTF8 s) := REGEXREPLACE(u8'[1-9]', s, u8'9'); // Leave '0' as-is and replace with '9' later
+        #UNIQUENAME(_MapAllUTF8);
+        LOCAL %_MapAllUTF8%(UTF8 s) := (STRING)%_MapDigitUTF8%(%_MapLowerCharUTF8%(%_MapUpperCharUTF8%(s)));
+
         // Trimming strings
         #UNIQUENAME(_TrimmedStr);
         LOCAL %_TrimmedStr%(STRING s) := TRIM(s, LEFT, RIGHT);
         #UNIQUENAME(_TrimmedUni);
         LOCAL %_TrimmedUni%(UNICODE s) := TRIM(s, LEFT, RIGHT);
+        #UNIQUENAME(_TrimmedUTF8);
+        LOCAL %_TrimmedUTF8%(UTF8 s) := TRIM(s, LEFT, RIGHT);
 
         // Collect a list of the top-level attributes that we can process,
         // determine the actual maximum length of a data pattern (if we can
@@ -711,7 +726,7 @@ EXPORT Profile(inFile,
             %Attribute_t%               attribute;
             %AttributeType_t%           given_attribute_type;
             %StringValue_t%             string_value;
-            UNSIGNED4                   value_count;
+            %RecCount_t%                value_count;
             %DataPattern_t%             data_pattern;
             UNSIGNED4                   data_length;
             BOOLEAN                     is_filled;
@@ -765,18 +780,24 @@ EXPORT Profile(inFile,
                                                     #ELSE
                                                         %_TrimmedUni%((%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
                                                     #END,
-                        UNSIGNED4           value_count := COUNT(GROUP),
+                        %RecCount_t%        value_count := COUNT(GROUP),
                         %DataPattern_t%     data_pattern :=
                                                 #IF(%_IsSetType%(%'@type'%))
                                                     %_MapAllStr%(%_TrimmedStr%(Std.Str.CombineWords((SET OF STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%), ', '))[..%foundMaxPatternLen%])
                                                 #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
                                                     %_MapAllStr%((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
-                                                #ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
+                                                #ELSEIF(REGEXFIND('unicode', %'@type'%))
                                                     #IF(%@size% < 0 OR (%@size% DIV 2 + 1) > %foundMaxPatternLen%)
                                                         %_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
                                                     #ELSE
                                                         %_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
                                                     #END
+                                                #ELSEIF(REGEXFIND('utf', %'@type'%))
+                                                    #IF(%@size% < 0 OR (%@size% DIV 2 + 1) > %foundMaxPatternLen%)
+                                                        %_MapAllUTF8%(%_TrimmedUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
+                                                    #ELSE
+                                                        %_MapAllUTF8%(%_TrimmedUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
+                                                    #END
                                                 #ELSEIF(REGEXFIND('string', %'@type'%))
                                                     #IF(%@size% < 0 OR %@size% > %foundMaxPatternLen%)
                                                         %_MapAllStr%(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
@@ -1117,7 +1138,7 @@ EXPORT Profile(inFile,
                             %filledDataInfoNumeric%(attribute = %'namePrefix'% + %'@name'%),
                             {
                                 string_value,
-                                UNSIGNED4 rec_count := SUM(GROUP, value_count)
+                                %RecCount_t% rec_count := SUM(GROUP, value_count)
                             },
                             string_value,
                             MERGE
@@ -1287,8 +1308,8 @@ EXPORT Profile(inFile,
                 {
                     attribute,
                     data_pattern,
-                    UTF8        example := string_value[..%foundMaxPatternLen%],
-                    UNSIGNED4   rec_count := SUM(GROUP, value_count)
+                    UTF8            example := string_value[..%foundMaxPatternLen%],
+                    %RecCount_t%    rec_count := SUM(GROUP, value_count)
                 },
                 attribute, data_pattern,
                 MERGE
@@ -1333,8 +1354,8 @@ EXPORT Profile(inFile,
                 {
                     attribute,
                     given_attribute_type,
-                    UNSIGNED4       rec_count := SUM(GROUP, value_count),
-                    UNSIGNED4       filled_count := SUM(GROUP, IF(is_filled, value_count, 0))
+                    %RecCount_t%    rec_count := SUM(GROUP, value_count),
+                    %RecCount_t%    filled_count := SUM(GROUP, IF(is_filled, value_count, 0))
                 },
                 attribute, given_attribute_type,
                 MERGE
@@ -1430,7 +1451,7 @@ EXPORT Profile(inFile,
                 {
                     %Attribute_t%       attribute,
                     BOOLEAN             is_numeric,
-                    UNSIGNED4           cardinality,
+                    %RecCount_t%        cardinality,
                     REAL                numeric_min,
                     REAL                numeric_max,
                     REAL                numeric_mean,
diff --git a/README.md b/README.md
index dcf1bbc..0591409 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,7 @@ level, such as within your "My Files" folder.
 |1.9.3|Better identify upper- and lower-case Unicode characters in text patterns; scan Unicode and UTF-8 strings to see if they can be represented with a STRING data type instead|
 |1.9.4|README fixes and updates; improve UTF-8 detection and prevent buffer overruns during character scans; use short form of Unicode property names in regex|
 |1.9.5|Correct Unicode regex regression introduced in 1.9.4|
+|1.10.0|Security in visualization; expand "record count" fields from UNSIGNED4 to UNSIGNED6 -- thanks to Manjunath Venkataswamy for requesting this improvement; add UTF8-specific TRIM and regex calls to avoid casting if possible|
 
 ---
 
diff --git a/report/package-lock.json b/report/package-lock.json
index cb0bd56..aaedb77 100644
--- a/report/package-lock.json
+++ b/report/package-lock.json
@@ -692,9 +692,9 @@
       "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8="
     },
     "fsevents": {
-      "version": "2.3.2",
-      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
-      "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
+      "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
       "dev": true,
       "optional": true
     },
@@ -1078,12 +1078,12 @@
       }
     },
     "rollup": {
-      "version": "2.41.4",
-      "resolved": "https://registry.npmjs.org/rollup/-/rollup-2.41.4.tgz",
-      "integrity": "sha512-f9IHfMO8p2Y8OdisI7Oj3oKkPuaQ6cgSwYqAi0TDvP3w2p+oX1VejX/w28a1h8WTnrapzfO5d4Uqhww+gL0b0g==",
+      "version": "3.29.5",
+      "resolved": "https://registry.npmjs.org/rollup/-/rollup-3.29.5.tgz",
+      "integrity": "sha512-GVsDdsbJzzy4S/v3dqWPJ7EfvZJfCHiDqe80IyrF59LYuP+e6U1LJoUqeuqRbwAWoMNoXivMNeNAOf5E22VA1w==",
       "dev": true,
       "requires": {
-        "fsevents": "~2.3.1"
+        "fsevents": "~2.3.2"
       }
     },
     "safe-buffer": {
diff --git a/report/package.json b/report/package.json
index a495833..32a5276 100644
--- a/report/package.json
+++ b/report/package.json
@@ -32,7 +32,7 @@
   "devDependencies": {
     "npm-run-all": "^4.1.5",
     "rimraf": "^3.0.2",
-    "rollup": "^2.41.4",
+    "rollup": "^3.29.5",
     "@rollup/plugin-node-resolve": "^11.2.0",
     "terser": "^5.6.0",
     "tslib": "^2.1.0",