From e7fa53dee6e7b7923743492c2498551c6a44c955 Mon Sep 17 00:00:00 2001 From: Giacomo Bergami Date: Wed, 19 Jun 2024 21:51:34 +0100 Subject: [PATCH] Latest version --- benchmark_model_size.csv | 74 --- benchmark_poly.csv | 10 + expand_elements.sh | 3 + .../knobab/mining/polyadic/polyadic_bolt.h | 57 ++- new_pipeline.cpp | 1 + poly_bench.sh | 6 +- .../02_crawl_single_model.py | 196 ++++++++ .../04_learn_from_cpp_csvs.py | 126 +++++ .../__pycache__/utils.cpython-310.pyc | Bin 0 -> 5923 bytes polyadic_preprocessing/crawl_single_model.py | 59 --- polyadic_preprocessing/external/__init__.py | 0 .../external/rocket_training.py | 9 + .../external/windowshap/README.md | 17 + .../external/windowshap/__init__.py | 0 .../external/windowshap/windowshap.py | 444 ++++++++++++++++++ polyadic_preprocessing/learn_from_cpp_csvs.py | 72 --- polyadic_preprocessing/log_json_to_numpy.py | 145 ++++++ polyadic_preprocessing/medical_analysis.py | 2 +- ....py => stats_crawl_results_model_stats.py} | 8 +- polyadic_preprocessing/utils.py | 36 ++ 20 files changed, 1038 insertions(+), 227 deletions(-) delete mode 100644 benchmark_model_size.csv create mode 100755 expand_elements.sh create mode 100644 polyadic_preprocessing/02_crawl_single_model.py create mode 100644 polyadic_preprocessing/04_learn_from_cpp_csvs.py create mode 100644 polyadic_preprocessing/__pycache__/utils.cpython-310.pyc delete mode 100644 polyadic_preprocessing/crawl_single_model.py create mode 100644 polyadic_preprocessing/external/__init__.py create mode 100644 polyadic_preprocessing/external/rocket_training.py create mode 100644 polyadic_preprocessing/external/windowshap/README.md create mode 100644 polyadic_preprocessing/external/windowshap/__init__.py create mode 100644 polyadic_preprocessing/external/windowshap/windowshap.py delete mode 100644 polyadic_preprocessing/learn_from_cpp_csvs.py create mode 100644 polyadic_preprocessing/log_json_to_numpy.py rename polyadic_preprocessing/{crawl_results_model_stats.py => stats_crawl_results_model_stats.py} (98%) diff --git a/benchmark_model_size.csv b/benchmark_model_size.csv deleted file mode 100644 index 77c5db0c..00000000 --- a/benchmark_model_size.csv +++ /dev/null @@ -1,74 +0,0 @@ -filename_polyadic,mining_supp,reduction,reclassify,isFilenamePolyadic,cpp_preprocess,loading,indexing,mining,refining,0,1 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,1,1099.4,1689.05,246.804,7251.89,114.59,9439,479 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,1,1141.24,2007.94,253.875,8002.66,0,533732,75820 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,0,1219.53,2004.85,359.461,9408.3,122.549,7610,173 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,0,1133.01,1758.15,259.612,7604.47,0,530206,75139 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,1,1120.33,1761.42,273.884,3126.8,62.5259,6498,478 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,1,1157.08,1886.24,260.621,3512.22,0,315106,32029 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,0,1222.29,1920,284.128,3396.46,61.2653,6756,172 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,0,1273.18,1950.75,305.145,3399.54,0,314490,31583 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,1,1252.64,1869.1,266.854,1477.86,33.4806,5702,478 
-/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,1,1219.19,1855.42,302.62,1503.69,0,147768,11695 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,0,1190.85,1976.92,331.079,1648.09,41.9786,6061,172 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,0,1238.49,1853.81,279.515,1431.37,0,147540,11302 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,1,1280.93,1869.98,366.512,322.316,11.0774,5487,478 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,1,1137.45,1754.38,264.683,284.171,0,25538,3404 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,0,1152.76,1773.71,262.786,286.108,10.2654,5836,172 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,0,1136.65,1870.08,248.834,285.844,0,25742,3149 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,1,1123.92,1738.98,247.793,186.474,8.96673,0,2764 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,1,1198.62,1837.25,262.961,202,0,298,52030 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,0,1107.49,1719.93,244.682,186.376,8.39929,0,1959 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,0,1185.12,1809.28,244.711,196.961,0,298,52637 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,1,1119.2,1725.64,237.594,127.964,7.24247,5005,478 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,1,1130.31,1735.11,245.407,122.864,0,6953,2484 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,0,1149.12,1760.43,241.127,129.149,7.1985,5336,172 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,0,1079.38,1709.02,241.849,117.925,0,7180,2211 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,1,1301.58,2163.63,368.928,222.438,0,298,52217 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,1,1118.84,1741.4,285.376,7746.16,113.068,7103,154 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,1,1144.55,1764.23,270.079,8023.76,0,525574,73332 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,0,1161.52,1789.42,270.583,9020.05,130.568,7382,154 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,0,1195.64,1849.54,303.424,8893.18,0,521259,72394 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,1,1175.35,1861.26,295.849,3406.37,63.2159,6499,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,1,1205.8,1939.56,262.098,3471.62,0,312047,31103 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,0,1155.33,1761.74,257.97,3011.85,58.2738,6335,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,0,1207.61,1853.37,253.524,3168.34,0,310971,30643 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,1,1149.51,1767.3,248.628,1354.25,32.6771,5789,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,1,1186.56,1828.07,255.645,1435.23,0,144941,11084 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,0,1193.54,1817.34,249.68,1420.82,32.9605,5624,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,0,1173.8,1822.78,265.152,1430.22,0,144124,10634 
-/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,1,1167.32,1773.89,248.039,273.39,9.6823,5573,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,1,1156.8,1888.85,330.842,293.511,0,23294,2417 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,0,1211.65,2053.25,350.924,300.892,10.3833,5398,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,0,1217.38,1852.89,269.338,272.204,0,22758,2142 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,1,1152.48,1783.94,273.103,191.392,8.47948,0,2233 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,1,1147.74,1806.4,268.174,197.1,0,298,52217 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,0,1132.37,1769.65,284.116,193.452,10.5667,0,2499 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,0,1231.13,1882.52,269.057,199.144,0,298,51088 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,1,1164.66,1767.48,270.478,125.169,6.51384,5076,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,1,1128.52,1771.37,348.037,121.904,0,5967,1732 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,0,1245.87,1900.8,284.205,123.48,6.64354,4904,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,0,1190.53,1814.45,265.941,108.35,0,5412,1466 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,1,369.196,1130.89,240.681,5347.68,79.1541,4897,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,1,370.103,1110.48,217.76,5461.91,0,307553,53479 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,0,390.888,1126.82,213.156,5326.23,84.1707,5061,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,0,364.473,1068.24,199.103,5507.18,0,305736,52997 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,1,395.096,1154.67,178.822,2235.38,48.929,4636,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,1,335.157,994.656,132.263,1932.11,0,201911,20528 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,0,325.114,942.691,133.291,1916.34,44.3508,4679,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,0,353.597,979.619,150.706,2078.34,0,201518,20423 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,1,350.133,980.558,141.927,1351.27,33.3041,3855,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,1,416.959,1090.24,187.732,1172.45,0,99282,6374 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,0,343.394,1018.68,173.083,1001.95,23.2435,3842,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,0,348.539,1007.23,156.05,1076.86,0,98849,6236 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,1,369.17,1025.38,196.532,188.573,6.35762,3406,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,1,315.176,970.604,197.169,193.359,0,17077,1180 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,0,331.102,989.719,165.18,176.651,7.10374,3399,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,0,345.19,1170.48,287.932,206.732,0,16972,1130 
-/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,1,360.264,1124.02,205.661,100.691,3.00067,0,744 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,1,333.938,993.624,167.747,98.4587,0,200,22686 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,0,342.206,997.067,157.278,97.1346,3.165,0,785 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,0,337.03,1026.49,179.884,97.0274,0,200,22547 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,1,325.013,979.388,165.127,64.723,4.23475,3135,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,1,356.321,1007.08,131.226,56.9022,0,3447,795 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,0,328,911.844,128.183,57.1585,4.14329,3114,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,0,319.621,903.721,130.917,53.0961,0,3331,753 diff --git a/benchmark_poly.csv b/benchmark_poly.csv index 107e49ae..3602517b 100644 --- a/benchmark_poly.csv +++ b/benchmark_poly.csv @@ -72,3 +72,13 @@ filename_polyadic,mining_supp,reduction,reclassify,isFilenamePolyadic,cpp_prepro /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,1,356.321,1007.08,131.226,56.9022,0 /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,0,328,911.844,128.183,57.1585,4.14329 /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,0,319.621,903.721,130.917,53.0961,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,300.86,800.527,117.078,20082.6,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,328.465,840.729,140.021,23425,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,293.664,774.624,116.497,19526.1,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,2886.19,9413,813.953,106920,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,1,0,1,2830.63,9298.7,568.097,110897,595.166 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,1,0,1,305.666,884.449,135.671,22015.3,175.412 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,1,0,1,331.926,843.255,136.441,21827.9,247.775 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,322.554,931.787,212.557,22430.9,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,1,0,0,308.326,856.088,150.196,23294.9,172.628 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,0,303.417,828.267,130.524,22808.3,0 diff --git a/expand_elements.sh b/expand_elements.sh new file mode 100755 index 00000000..31ec4bfe --- /dev/null +++ b/expand_elements.sh @@ -0,0 +1,3 @@ +find /home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/ -maxdepth 1 -mindepth 1 -type d |while read fname; do + ./cmake-build-release/knobab_json -f "$fname" -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json; +done \ No newline at end of file diff --git a/include/knobab/mining/polyadic/polyadic_bolt.h b/include/knobab/mining/polyadic/polyadic_bolt.h index 4651d456..5cb8cd36 100644 --- a/include/knobab/mining/polyadic/polyadic_bolt.h +++ b/include/knobab/mining/polyadic/polyadic_bolt.h @@ -107,6 +107,11 @@ struct result_container { if ((val == 1) || (val == -2)) val = 0; } + 
for (size_t i = 0; i& l, const pattern_mining_result& r) { return std::tie(l.clause.casusu, l.clause.left, l.clause.right, l.clause.n) == std::tie(r.clause.casusu, r.clause.left, r.clause.right, r.clause.n); }), Phi.end()); - DEBUG_ASSERT(curr_size_Clauses == Phi.size()); +// DEBUG_ASSERT(curr_size_Clauses == Phi.size()); } inline void mdev(size_t i) { @@ -1011,6 +1016,7 @@ struct polyadic_bolt { } setKnowledgeBaseAndInit(ptr); std::vector actLabels; + std::unordered_set act_to_consider; std::unordered_map, std::vector> result_map; std::unordered_map> act_Labels; std::unordered_map> noact_Labels; @@ -1019,19 +1025,24 @@ struct polyadic_bolt { ssize_t trace_id = -1; size_t log_size = ptr->nTraces(); for (const auto& x : acts) { - size_t id = ptr->event_label_mapper.get(x); - actLabels.emplace_back(id); - auto a_beginend = kb->timed_dataless_exists(id); - auto& v = act_Labels[id]; - trace_id = -1; - while (a_beginend.first != a_beginend.second) { - if (trace_id != a_beginend.first->entry.id.parts.trace_id) { - trace_id = a_beginend.first->entry.id.parts.trace_id; - v.emplace_back(trace_id); + if(ptr->event_label_mapper.signed_get(x)>0) { + size_t id = ptr->event_label_mapper.get(x); + actLabels.emplace_back(id); + act_to_consider.insert(id); + auto a_beginend = kb->timed_dataless_exists(id); + auto& v = act_Labels[id]; + trace_id = -1; + while (a_beginend.first != a_beginend.second) { + if (trace_id != a_beginend.first->entry.id.parts.trace_id) { + trace_id = a_beginend.first->entry.id.parts.trace_id; + v.emplace_back(trace_id); + } + a_beginend.first++; } - a_beginend.first++; + set_complement(log_size, v.begin(), v.end(), std::back_inserter(noact_Labels[id])); + } else { +// std::cerr << x << std::endl; } - set_complement(log_size, v.begin(), v.end(), std::back_inserter(noact_Labels[id])); } // remove_duplicates(actLabels); FastDatalessClause clause; @@ -1045,16 +1056,19 @@ struct polyadic_bolt { for (size_t trace_id = 0; trace_id < log_size; trace_id++) { const auto& first_last =kb->act_table_by_act_id.secondary_index.at(trace_id); for (auto it = first_last.first->begin(), en = first_last.first->end(); it!=en; it++) { - first[it->first].emplace_back(trace_id); + if (act_to_consider.contains(it->first)) + first[it->first].emplace_back(trace_id); } for (auto it = first_last.second->begin(), en = first_last.second->end(); it!=en; it++) { - last[it->first].emplace_back(trace_id); + if (act_to_consider.contains(it->first)) + last[it->first].emplace_back(trace_id); } } std::cout << "First..." << std::endl; std::tuple simplistic_clause{"Init","","§1"}; for (const auto& [act_id, traces] : first) { all_VIOL.clear(); + if (!act_to_consider.contains(act_id)) continue; std::get<1>(simplistic_clause) = ptr->event_label_mapper.get(act_id); auto& v = result_map[simplistic_clause]; v.resize(log_size, -1); @@ -1066,6 +1080,7 @@ struct polyadic_bolt { std::cout << "Last..." << std::endl; for (const auto& [act_id, traces] : last) { all_VIOL.clear(); + if (!act_to_consider.contains(act_id)) continue; std::get<1>(simplistic_clause) = ptr->event_label_mapper.get(act_id); auto& v = result_map[simplistic_clause]; v.resize(log_size, -1); @@ -1078,10 +1093,15 @@ struct polyadic_bolt { std::get<0>(simplistic_clause) = "Exists"; std::cout << "Exists..." 
<< std::endl; for (const auto& [act_id, countings] : exists) { + if(ptr->event_label_mapper.signed_get(act_id)<0) { + continue; + } + if (!act_to_consider.contains(ptr->event_label_mapper.get(act_id))) continue; std::get<1>(simplistic_clause) = act_id; auto indexes = ptr->resolveCountingData(act_id); if ((indexes.first == indexes.second) && (indexes.first == (uint32_t)-1)) { - exit(5); + continue; +// exit(5); } else { std::unordered_map MAP; for (size_t count : countings) { @@ -1102,10 +1122,15 @@ struct polyadic_bolt { std::get<0>(simplistic_clause) = "Absence"; std::cout << "Absence..." << std::endl; for (const auto& [act_id, countings] : absence) { + if(ptr->event_label_mapper.signed_get(act_id)<0) { + continue; + } + if (!act_to_consider.contains(ptr->event_label_mapper.get(act_id))) continue; std::get<1>(simplistic_clause) = act_id; auto indexes = ptr->resolveCountingData(act_id); if ((indexes.first == indexes.second) && (indexes.first == (uint32_t)-1)) { - exit(4); + continue; +// exit(4); } else { std::unordered_map MAP; for (size_t count : countings) { diff --git a/new_pipeline.cpp b/new_pipeline.cpp index b2f002f7..1d854add 100644 --- a/new_pipeline.cpp +++ b/new_pipeline.cpp @@ -102,6 +102,7 @@ struct benchmarking { #include int main(int argc, char **argv) { + // Phases 01 (mining the models from the data) and 03 (deriving the decision tree structure) // CyberSecurity configuration: // -s 0.8 --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab /home/giacomo/Scaricati/classes/Adware.tab_100.tab /home/giacomo/Scaricati/classes/Backdoor.tab_100.tab /home/giacomo/Scaricati/classes/Downloader.tab_100.tab /home/giacomo/Scaricati/classes/Dropper.tab_100.tab /home/giacomo/Scaricati/classes/Spyware.tab_100.tab /home/giacomo/Scaricati/classes/Trojan.tab_100.tab /home/giacomo/Scaricati/classes/Virus.tab_100.tab /home/giacomo/Scaricati/classes/Worms.tab_100.tab diff --git a/poly_bench.sh b/poly_bench.sh index dffd92c5..19a25294 100755 --- a/poly_bench.sh +++ b/poly_bench.sh @@ -9,4 +9,8 @@ done ./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -r -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json ./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json ./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -r -l -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json -./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -l -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json \ No newline at end of file +./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -l -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json +./cmake-build-release/knobab_json -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -r -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json +./cmake-build-release/knobab_json -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json +./cmake-build-release/knobab_json -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -r -l -p 
/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json
+./cmake-build-release/knobab_json -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -l -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json
\ No newline at end of file
diff --git a/polyadic_preprocessing/02_crawl_single_model.py b/polyadic_preprocessing/02_crawl_single_model.py
new file mode 100644
index 00000000..ab9d4b84
--- /dev/null
+++ b/polyadic_preprocessing/02_crawl_single_model.py
@@ -0,0 +1,196 @@
+import glob
+import os.path
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+from utils import ForParsing
+
+
+class ProcessClasses:
+    def __init__(self):
+        self.d_exists = dict()
+        self.d_absences = dict()
+        self.acts = set()
+        self.files = set()
+
+    def dump(self, folder, rem=None):
+        # normalise whatever we receive (str or Path) into a Path
+        path = folder if isinstance(folder, Path) else Path(folder)
+        print("Writing acts....")
+        if rem is not None:
+            self.acts.discard(rem)
+            self.d_exists.pop(rem, None)
+            self.d_absences.pop(rem, None)
+        with open(os.path.join(path.absolute(), "acts.txt"), "w") as fp:
+            fp.write(os.linesep.join(self.acts))
+        with open(os.path.join(path.absolute(), "exists.txt"), "w") as fp:
+            fp.write(os.linesep.join(
+                [" ".join([str(len(self.d_exists[act]))] + [str(x) for x in self.d_exists[act]] + [act]) for act in self.d_exists]))
+        with open(os.path.join(path.absolute(), "absences.txt"), "w") as fp:
+            fp.write(os.linesep.join(
+                [" ".join([str(len(self.d_absences[act]))] + [str(x) for x in self.d_absences[act]] + [act]) for act in self.d_absences]))
+
+    def process(self, file):
+        with open(file, "r") as f:
+            self.files.add(file)
+            for line in f.read().splitlines():
+                if line.startswith("Exists"):
+                    firstOpen = line.find('(')
+                    lastClose = line.rfind(')')
+                    lastPar = line.rfind('§')
+                    act = line[firstOpen + 1:lastPar - 1].strip()
+                    self.acts.add(act)
+                    self.d_exists.setdefault(act, set()).add(int(line[lastPar + 1:lastClose]))
+                elif line.startswith("Absence"):
+                    firstOpen = line.find('(')
+                    lastClose = line.rfind(')')
+                    lastPar = line.rfind('§')
+                    act = line[firstOpen + 1:lastPar - 1].strip()
+                    self.acts.add(act)
+                    self.d_absences.setdefault(act, set()).add(int(line[lastPar + 1:lastClose]))
+                elif line.startswith("Init") or line.startswith("End"):
+                    firstOpen = line.find('(')
+                    lastPar = line.rfind('§')
+                    act = line[firstOpen + 1:lastPar - 1].strip()
+                    self.acts.add(act)
+                else:
+                    # binary clause, e.g. Response(a,b): collect both activity labels
+                    firstOpen = line.find('(')
+                    lastClose = line.rfind(')')
+                    comma = line.find(',')
+                    comma2 = line.rfind(',')
+                    if comma == comma2:
+                        act = line[firstOpen + 1:comma].strip()
+                        self.acts.add(act)
+                        act = line[comma + 1:lastClose].strip()
+                        self.acts.add(act)
+                    else:
+                        print("PARSING ERROR")
+                        sys.exit(1)
+
+
+file = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/removed_non_rilevazione/log_weekly.json_1_1_1_0_clazz=1.txt"
+
+def old():
+    pc = ProcessClasses()
+    pc.process(file)
+    pc.dump(Path(file).parent.absolute())
+
+
+def neu():
+    # exploratory variant, superseded by the yielder-based driver below
+    filename_fields = ["mining_supp", "reduction", "isFilenamePolyadic", "reclassify"]
+    S = set(["Choice", "RespExistence", "Response", "ChainResponse", "Precedence", "ChainPrecedence", "CoExistence",
+             "Succession", "ChainSuccession", "Init", "End", "Exists",
"Absence", "Choice", "ExclChoice"]) + root_dir = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/" + untimed = set(["Choice", "RespExistence", "CoExistence", "Choice", "ExclChoice"]) + timed = S.difference(untimed) + # desirGlobal = dict() + + # root_dir needs a trailing slash (i.e. /root/dir/) + L = [] + L2 = [] + for filename in glob.iglob(root_dir + '**/*.txt', recursive=True): + with open(filename, 'r') as f: + stem = Path(filename).stem.split("_") + clazz = stem[-1].replace("clazz=", "") + stem = stem[:-1] + d = dict(zip(filename_fileds, stem[-len(filename_fileds):])) + dtmp = dict() + d["class"] = clazz + d["filename"] = "_".join(stem[:-len(filename_fileds)]) + key = tuple(stem[-len(filename_fileds):]) + if "isFilenamePolyadic" in d: + d.pop("isFilenamePolyadic") + d.pop("reclassify") + d.pop("class") + d.pop("filename") + for i in range(3): + minsupp_reduction_conf[d["mining_supp"]][d["reduction"]][i] = "ciao" + + +minsupp_reduction_conf = defaultdict(lambda : defaultdict(lambda : defaultdict(ProcessClasses))) +def yi(file, key, d): + if "isFilenamePolyadic" in d: + d.pop("reclassify") + d.pop("class") + d.pop("filename") + minsupp_reduction_conf[d["isFilenamePolyadic"]][d["mining_supp"]][d["reduction"]].process(file) + +path = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/" +fp = ForParsing(path) +fp.yielder(yi) +for poly, inner in minsupp_reduction_conf.items(): + NP = "nopoly" if (int(poly)==0 or (not poly)) else "poly" + for supp, values in inner.items(): + for red, obj in values.items(): + p = Path(os.path.join(path, f"{NP}_s{supp}_{red}")) + p.mkdir(parents=True, exist_ok=True) + obj.dump(p, "__missing") + +# neu() +# print(minsupp_reduction_conf) +# with open(file, "r") as f: +# for line in f.read().splitlines(): +# if line.startswith("Exists"): +# firstOpen = line.find('(') +# lastClose = line.rfind(')') +# lastPar = line.rfind('§') +# act = line[firstOpen + 1:lastPar - 1].strip() +# acts.add(act) +# if act not in d_exists: +# d_exists[act] = set() +# d_exists[act].add(int(line[lastPar+1:lastClose])) +# elif line.startswith("Absence"): +# firstOpen = line.find('(') +# lastClose = line.rfind(')') +# lastPar = line.rfind('§') +# act = line[firstOpen + 1:lastPar - 1].strip() +# acts.add(act) +# if act not in d_absences: +# d_absences[act] = set() +# d_absences[act].add(int(line[lastPar+1:lastClose])) +# elif line.startswith("Init") or line.startswith("End"): +# firstOpen = line.find('(') +# lastClose = line.rfind(')') +# lastPar = line.rfind('§') +# act = line[firstOpen + 1:lastPar - 1].strip() +# acts.add(act) +# else: +# firstOpen = line.find('(') +# lastClose = line.rfind(')') +# comma = line.find(',') +# comma2 = line.rfind(',') +# if (comma == comma2): +# act = line[firstOpen + 1:comma].strip() +# acts.add(act) +# act = line[comma +1:lastClose].strip() +# acts.add(act) +# else: +# sys.exit(1) +# +# +# path = Path(file) +# print("Writing acts....") +# with open(os.path.join(path.parent.absolute(), "acts.txt"), "w") as fp: +# fp.write(os.linesep.join(acts)) +# +# with open(os.path.join(path.parent.absolute(), "exists.txt"), "w") as fp: +# fp.write(os.linesep.join([" ".join([str(len(d_exists[act]))]+[str(x) for x in d_exists[act]]+[act]) for act in d_exists])) +# +# with open(os.path.join(path.parent.absolute(), "absences.txt"), "w") as fp: +# fp.write(os.linesep.join([" ".join([str(len(d_exists[act]))]+[str(x) for x in d_exists[act]]+[act]) for act in d_absences])) \ No newline at end of file diff --git 
diff --git a/polyadic_preprocessing/04_learn_from_cpp_csvs.py b/polyadic_preprocessing/04_learn_from_cpp_csvs.py
new file mode 100644
index 00000000..e975ba9b
--- /dev/null
+++ b/polyadic_preprocessing/04_learn_from_cpp_csvs.py
@@ -0,0 +1,126 @@
+import os
+import sys
+from collections import defaultdict
+
+import pandas
+from sklearn.metrics import accuracy_score, precision_score
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+
+from utils import export_text2
+
+from pathlib import Path
+folder = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/nopoly_s0_1/"
+class0 = os.path.join(folder, "output_csv_0.csv")
+class1 = os.path.join(folder, "output_csv_1.csv")
+spec = None  # e.g. "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/log_weekly.json_0.2_0_0_0_clazz=1.txt"
+modelfile = Path(folder).name + ".txt"
+
+def loadDataset(class0, class1):
+    df0 = pandas.read_csv(class0, index_col=0, header=None).transpose()
+    df0['class'] = 0
+    df1 = pandas.read_csv(class1, index_col=0, header=None).transpose()
+    df1['class'] = 1
+    return pandas.concat([df0, df1], axis=0, ignore_index=True).fillna(-1)
+
+def readFileForSpec(filename):
+    S = set()
+    with open(filename, "r") as f:
+        for line in f.readlines():
+            line = line.strip()
+            S.add(line)
+            coex = line.find("CoExistence(")
+            cho = line.find("Choice(")
+            excl = line.find("ExclChoice(")
+            firstOpen = line.find('(')
+            lastClose = line.rfind(')')
+            if coex == 0 or cho == 0 or excl == 0:
+                comma = line.find(',')
+                comma2 = line.rfind(',')
+                if comma == comma2:
+                    act1 = line[firstOpen + 1:comma].strip()
+                    act2 = line[comma + 1:lastClose].strip()
+                    # these templates are symmetric, so the flipped clause is retained as well
+                    if coex == 0:
+                        S.add("CoExistence(" + act2 + "," + act1 + ")")
+                    elif cho == 0:
+                        S.add("Choice(" + act2 + "," + act1 + ")")
+                    elif excl == 0:
+                        S.add("ExclChoice(" + act2 + "," + act1 + ")")
+                else:
+                    sys.exit(1)
+    return S
+
+class LearnRepresentation:
+    def __init__(self, folder, spec=None):
+        self.class0 = os.path.join(folder, "output_csv_0.csv")
+        self.class1 = os.path.join(folder, "output_csv_1.csv")
+        self.spec = spec
+        self.modelfile = Path(folder).name + ".txt"
+        self.dict_list = loadDataset(self.class0, self.class1)
+        if spec is not None:
+            S = readFileForSpec(spec)
+            self.dict_list = self.dict_list[list(set(self.dict_list.columns).intersection(S)) + ["class"]]
+
+    def test(self, poly, supp, red, outcomes):
+        if self.dict_list.empty or (len(set(self.dict_list.columns)) == 1 and ("class" in set(self.dict_list.columns))):
+            print("No data")
+        else:
+            X = self.dict_list.drop(labels=['class'], axis=1)
+            y = self.dict_list['class']
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
+            rf = DecisionTreeClassifier(criterion="gini", max_depth=5)
+            rf.fit(X_train, y_train)
+            y_pred = rf.predict(X_test)
+            accuracy = accuracy_score(y_test, y_pred)
+            precision = precision_score(y_test, y_pred)
+            print(f"poly: {poly}\tsupp: {supp}\tred: {red}\tacc: {accuracy}\tprec: {precision}")
+            outcomes.append({"poly": poly, "supp": supp, "red": red, "accuracy": accuracy, "precision": precision, "model": os.linesep.join(export_text2(rf, X.columns, show_weights=True))})
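The CSV layout that loadDataset() above assumes (an inference from the code, not a documented format) is one row per DECLARE constraint, with the constraint name in the index column and one column per trace; transpose() then yields one row per trace and one feature column per constraint. A hypothetical miniature file:

    import io
    import pandas
    csv0 = 'Exists(a §1),1,0,1\nInit(b §1),0,0,1\n'
    df0 = pandas.read_csv(io.StringIO(csv0), index_col=0, header=None).transpose()
    # df0 now holds 3 rows (traces) with columns "Exists(a §1)" and "Init(b §1)"
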
"nopoly" + red = 1 if (red or (int(red) == 1)) else 0 + return f"{poly}_s{s}_{red}" + +outcomes = [] +path = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/" +# name = genfolder(1, 1.0, 1) +# abs_folder = os.path.join(path, name) +# lr = LearnRepresentation(abs_folder) +# for _ in range(20): + # lr.test(1, 1.0, 1, outcomes) +for name in os.listdir(path): + abs_folder = os.path.join(path, name) + if os.path.isdir(abs_folder): + print(name) + lr = LearnRepresentation(abs_folder) + arr = name.split("_") + arr[0] = 0 if arr[0] == "nopoly" else 1 + arr[1] = float(arr[1][1:]) + for _ in range(20): + lr.test(arr[0], arr[1], int(arr[2])==1, outcomes) +pandas.DataFrame(outcomes).to_csv("results_proposed.csv",index=False) + + + + + + +# if dict_list.empty or (len(set(dict_list.columns)) == 1 and ("class" in set(dict_list.columns))): +# print("No data") +# with open(modelfile, "w") as file: +# file.write("No data") +# else: +# X = dict_list.drop(labels=['class'], axis=1) +# y = dict_list['class'] +# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y) +# rf = DecisionTreeClassifier(criterion="entropy") +# rf.fit(X_train, y_train) +# y_pred = rf.predict(X_test) +# accuracy = accuracy_score(y_test, y_pred) +# print("Accuracy:", accuracy) +# with open(modelfile, "w") as file: +# file.write(os.linesep.join(export_text2(rf, X.columns, show_weights=True))) +# file.write(os.linesep + ("Accuracy: ") + str(accuracy)) \ No newline at end of file diff --git a/polyadic_preprocessing/__pycache__/utils.cpython-310.pyc b/polyadic_preprocessing/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90d517f225fd7c8170ab2ab808a0a46bd5f8ea8d GIT binary patch literal 5923 zcma)A&2JpJl`qy;SGUx%?Af))!6aMj>}1r2(voM9k5MdbW^8$5AgeJfHj6!$+TBR7^1|{VSWwZyB%-eV-~-} zUG6<&`#$%1>6x`(`ge;5y!_1KkJ|$_uny_9@j7^2?);{N zzTBPU|4No6T`<>5+C335#CP&cwY9_J6YaM1p45IP$$Cm?zYvW)<9eZ!G-N(1)I1&V zSP7MMgbW}@k_wV+C>Kbqt=X|={EwErgyL7*Ywf%v)>=v2$UFI3SLTPJp^CMmEI)`3 zK8(`5)e_lSH&0Juo;0Ga6yRwH3_feE^;D7;_3nxGpvdPrbhip0$yMeu|Fyh~Y=b6f z^3cGjY)tYKWRx|qPT7#YHoh>(fL*hOFv`F=V)8wFZ5K_BSbm)-yOwP+YZt8s_7UrT zdti^KHP-*`L!o*y3zh7NFlmOzVN!$%%54$KIBSXUvqHsEt%f4wk7~?lS97&zsM_Nq zMRgTQX$Ks}JfmcpXjQUklw^^SMW!qkkpyn62^vvII9AmEB&$fD+v<+{ujq zQ&GLsj+1QKzDl`S?Ilw?Cuc;M3ELvX%5PHg&y@TNCEr0(^Yx|XcyJMAc%cKC=PKff z)NYCaXtt}JLMUD8Wdt6MwdocjZOWS@rAEmrC3Q+@HO|2;h=8?3MbS0Ti*F$Dna?V0 zkuBQeuS4G@;I9A93^esgy9=&}mjBch{RsuQAuF|pHo2peb?l+T0H}#Gx@Hk7e`DXZ z9vuu_rvIT&Y>P~5`2ZPce;0w1cUOKN*yBQbpykw|dylttax3p)&GJx+6%7fq( zn*RcFUHmAY1-Go?@e{`2Cz$o0cpv`*LAO@jk>sLlOuG+a)$Rv3Z?4^}t7EkX5>VUd zVby&uG?SV|uU_Nn`+#b{zo{+wHv+8=CR1L!q}^7UALvpSveM*0-hyoHB}6pytKGaS zGEBG<9r&1T3V@g3JkuC}cHv5ucKS)zz(6}#CE7jAlT3T6*G)yOBG*Wu-E7K_kTK%r z4~fK@a?|u>l#`=HbR$yo!WgvpWP9 zaQu#;y&~`UihSF>3*L@)5_+k+&`1Cv>qQAEdjasifct!bU32K2+1%|hc|+M}0M%LP z&^dBY0(>byqBiZT@Phxv25`IBR8~=28jNXY3^YrAAGF|%jrag~O9Alm6J7?y-Qr_j z8Ty&c7l!`0zuo6{lkr7feHNgf(p|`Z;bZ|{0>zq9o?Fk^bLYAH+LA=v7K zs8=c>o}F9?@Wm=Th#N;wV#$jTJ37#r93;TrN!W=`!YoI<2+2`HzTo4=*l;ktO31>< z`tI~G#YuN2%n3Kb{D49PQ$;=RVSk;l7>z|vm!Zn3J8?Yejri%gufj&0Q617G z!ww8LkHJlazCF|mA#%Rvp=e1&BDoxoqbXv90K$N^fTrYy$z+9+tI)9-_foaFlV@Vu z&&N|9tiYCPhgmchLgvd9_KwK}jOiiay{Jzm+;776nFtXj37IrT)!~yQO=)<&Lhx|q z#ze988>`{8a0`Wcy5)XuWB19Gn-h!zNLfT2I4W2|HDqA%}c#^<5ji4I~OSl;OF z!?$!d<_&q$N>o8;;|V-%d@|7->nD05&^2l|!h<|dXJ);JPG&&BG@8JX%r_2D64(i0 zOgIrRvC{<3A?ifjXosnYn=?SrdiX06XiBJqY-X^y18-l9?_#>aPwY3#CW!?#k zBMebybsQ{W1T+Y%JH|hydCa3kCL=Y!#%^FZW}RQQccy12)9K%AH!^laUUAAyJ%*d% zwD*4JUrX_?*qrz%I}}cr=sM1t8Rl91#YX 
z%CIX`oMJ(9tmu_SrxxDc4A<*F9Ql(0{IKCNu-gd7)VH?V3g6Q1ZPIVmfABqHb0htY z9&XKQ{#q}8v;D{g^cSBFL;Nw8r^Cvm1P!U=o4U$Qzkb0?VW$UoId=!i_g z6}7M>zRyi)*#G9l)fslU@2wCM-9l8Dfv5jZG&{yOP3vLLIflzmhvLcsH=I%ETeO9O_Grh0~>-gPTO-2R3A*hbjE za3>)tKGI8j54X3YpY7b;zIT6T`|clYZP(W|OAyyqFPLHOd{Mbgn4e~0 zHoNlL#>u@u56Tbgx6`}^ana$tEiID-(U54xydj z%q0Q^`2bDk!bTG@?t%6N$s^lby?Bj4%2IS_%yj@mmaQ}Y5GK2BDRyYXT>Ta<5zcI) z3|-XWdm*%9mA)R_GO{L=iH76Z) zBk2Rgjocar=zoD%j%@h?IUKUCAG*Xjb`(Rlj1j z(E{=i&+*XQU-c{D=2rOA_&}taeLq}TUt6!$uISPj00z18QxZ{DZA5YHJmvCdD9e8% zB5q|QPDh_m-iPzYjf}1~+K7>*$f8stQZj{#KaZd z!mNbT;lw}ar3uW9@x#g34q!#_ySyY7i>P=0X|I?-H?%t9{P+&!A>LURf>GUpuN^xUk2+oR+57VQ?r6(j+! S9G6+e3EZUx`+b`&z5WXpbVVQl literal 0 HcmV?d00001 diff --git a/polyadic_preprocessing/crawl_single_model.py b/polyadic_preprocessing/crawl_single_model.py deleted file mode 100644 index 30951ba3..00000000 --- a/polyadic_preprocessing/crawl_single_model.py +++ /dev/null @@ -1,59 +0,0 @@ -import os.path -import sys - -file = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/removed_non_rilevazione/log_weekly.json_1_1_1_0_clazz=1.txt" - -d_exists = dict() -d_absences = dict() -acts = set() - -with open(file, "r") as f: - for line in f.read().splitlines(): - if line.startswith("Exists"): - firstOpen = line.find('(') - lastClose = line.rfind(')') - lastPar = line.rfind('§') - act = line[firstOpen + 1:lastPar - 1].strip() - acts.add(act) - if act not in d_exists: - d_exists[act] = set() - d_exists[act].add(int(line[lastPar+1:lastClose])) - elif line.startswith("Absence"): - firstOpen = line.find('(') - lastClose = line.rfind(')') - lastPar = line.rfind('§') - act = line[firstOpen + 1:lastPar - 1].strip() - acts.add(act) - if act not in d_absences: - d_absences[act] = set() - d_absences[act].add(int(line[lastPar+1:lastClose])) - elif line.startswith("Init") or line.startswith("End"): - firstOpen = line.find('(') - lastClose = line.rfind(')') - lastPar = line.rfind('§') - act = line[firstOpen + 1:lastPar - 1].strip() - acts.add(act) - else: - firstOpen = line.find('(') - lastClose = line.rfind(')') - comma = line.find(',') - comma2 = line.rfind(',') - if (comma == comma2): - act = line[firstOpen + 1:comma].strip() - acts.add(act) - act = line[comma +1:lastClose].strip() - acts.add(act) - else: - sys.exit(1) - -from pathlib import Path -path = Path(file) -print("Writing acts....") -with open(os.path.join(path.parent.absolute(), "acts.txt"), "w") as fp: - fp.write(os.linesep.join(acts)) - -with open(os.path.join(path.parent.absolute(), "exists.txt"), "w") as fp: - fp.write(os.linesep.join([" ".join([str(len(d_exists[act]))]+[str(x) for x in d_exists[act]]+[act]) for act in d_exists])) - -with open(os.path.join(path.parent.absolute(), "absences.txt"), "w") as fp: - fp.write(os.linesep.join([" ".join([str(len(d_exists[act]))]+[str(x) for x in d_exists[act]]+[act]) for act in d_absences])) \ No newline at end of file diff --git a/polyadic_preprocessing/external/__init__.py b/polyadic_preprocessing/external/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/polyadic_preprocessing/external/rocket_training.py b/polyadic_preprocessing/external/rocket_training.py new file mode 100644 index 00000000..1ea82a4f --- /dev/null +++ b/polyadic_preprocessing/external/rocket_training.py @@ -0,0 +1,9 @@ +import numpy +from sktime.classification.kernel_based import RocketClassifier + +def train(X_train, y_train, X_test, y_test): + clf = RocketClassifier(num_kernels=500, use_multivariate="yes") + clf.fit(X_train, y_train) + acc = numpy.sum(clf.predict(X_test) - 
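A quick way to exercise train() on synthetic panel data (the shapes below are arbitrary; sktime's RocketClassifier accepts 3D numpy arrays of shape (instances, channels, time points)):

    import numpy as np
    from rocket_training import train  # assumes the module is on the import path
    rng = np.random.default_rng(0)
    X_train, y_train = rng.normal(size=(20, 3, 50)), rng.integers(0, 2, 20)
    X_test, y_test = rng.normal(size=(10, 3, 50)), rng.integers(0, 2, 10)
    train(X_train, y_train, X_test, y_test)  # prints the test accuracy
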
diff --git a/polyadic_preprocessing/external/windowshap/README.md b/polyadic_preprocessing/external/windowshap/README.md
new file mode 100644
index 00000000..4ed91a88
--- /dev/null
+++ b/polyadic_preprocessing/external/windowshap/README.md
@@ -0,0 +1,17 @@
+# WindowSHAP
+## Introduction
+
+When working with time-series predictive models, it is crucial to have an explainability method that suits time-series data, is computationally efficient, and can handle dependencies between sequential data points. WindowSHAP is a framework designed specifically for this purpose. This repository includes an implementation of WindowSHAP, an explanation method for time-series classifiers. For more information about the method, please refer to the [original paper](https://arxiv.org/abs/2211.06507).
+
+WindowSHAP improves the explainability of time-series prediction models by reducing the number of features for which Shapley values must be computed: neighboring time steps are merged into a time window. The framework offers various types of time windows, each with its own advantages. The main contributions of WindowSHAP are:
+
+- Adapting Shapley additive explanations to time-series data in an efficient way.
+- Introducing variations of WindowSHAP based on different windowing techniques, for both fixed- and variable-length time windows.
+
+The following figure shows how increasing the window length in the WindowSHAP framework can reduce the runtime of the algorithm exponentially.
+
+![Runtime](RunTime.png)
+
+The framework provides three distinct algorithms: Stationary, Sliding, and Dynamic WindowSHAP. These algorithms explain time-series classifiers using Shapley values.
+
+## How to Use WindowSHAP
+To use the WindowSHAP framework, one just needs to copy the `windowshap.py` file into their working directory. There are three algorithms in the package: Stationary WindowSHAP, Sliding WindowSHAP, and Dynamic WindowSHAP. A class is defined in `windowshap.py` for each of them.
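A minimal usage sketch for the classes that follow (the toy model and the random data are hypothetical; any object exposing a predict() method works):

    import numpy as np
    from windowshap import StationaryWindowSHAP  # assumes windowshap.py is in the working directory

    class MeanModel:  # stand-in for a trained time-series classifier
        def predict(self, x):
            return x.mean(axis=(1, 2)).reshape(-1, 1)

    background = np.random.randn(10, 24, 3)  # (instances, time steps, features)
    test = np.random.randn(1, 24, 3)
    ws = StationaryWindowSHAP(MeanModel(), window_len=6, B_ts=background, test_ts=test)
    ts_phi = ws.shap_values()  # per-time-step attributions for the test instance
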
diff --git a/polyadic_preprocessing/external/windowshap/__init__.py b/polyadic_preprocessing/external/windowshap/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/polyadic_preprocessing/external/windowshap/windowshap.py b/polyadic_preprocessing/external/windowshap/windowshap.py
new file mode 100644
index 00000000..adaf2252
--- /dev/null
+++ b/polyadic_preprocessing/external/windowshap/windowshap.py
@@ -0,0 +1,444 @@
+import math
+import numpy as np
+import shap
+from copy import deepcopy
+import warnings
+warnings.filterwarnings('ignore')
+
+class StationaryWindowSHAP():
+    '''
+    A class for computing the Shapley values of time series data. Only the SHAP values for the first
+    output are reported.
+
+    Parameters:
+    model: A model object used for prediction. It must expose a predict() method that produces the model output for a given input
+    window_len: The length of the time window used by the algorithm
+    B_ts: A 3D numpy array of background time series data
+    test_ts: A 3D numpy array of test time series data
+    B_mask: A 3D numpy array of background masking data. It is only used for specific models, such as GRU-D, where a masking variable is passed to the model alongside the time series data. (default: None)
+    B_dem: A 2D numpy array of background demographic data (non-temporal data). It is only used for models with both temporal and non-temporal modalities. (default: None)
+    test_mask: A 3D numpy array of test mask data (default: None)
+    test_dem: A 2D numpy array of test demographic data (default: None)
+    model_type: The type of model being used. Set the parameter to 'lstm' when time series data is the only input, to 'lstm_dem' when the input includes both time series and demographic (non-temporal) data, and to 'grud' when you are using a GRU-D structure. (default: 'lstm')
+    '''
+    def __init__(self, model, window_len, B_ts, test_ts, B_mask=None, B_dem=None,
+                 test_mask=None, test_dem=None, model_type='lstm'):
+        self.model = model
+        self.window_len = window_len
+        self.num_window = np.ceil(B_ts.shape[1]/self.window_len).astype('int')
+        self.num_background = len(B_ts)
+        self.num_test = len(test_ts)
+        self.background_ts = B_ts
+        self.background_mask = B_mask
+        self.background_dem = B_dem
+        self.test_ts = test_ts
+        self.test_mask = test_mask
+        self.test_dem = test_dem
+        self.model_type = model_type
+        self.ts_phi = None
+        self.dem_phi = None
+        self.explainer = None
+
+        # Problem sizes
+        self.num_ts_ftr = B_ts.shape[2]
+        self.num_ts_step = B_ts.shape[1]
+        self.num_dem_ftr = 0 if B_dem is None else B_dem.shape[1]
+
+        # Creating all data (background and test together)
+        self.all_ts = np.concatenate((self.background_ts, self.test_ts), axis=0)
+        self.all_mask = None if test_mask is None else np.concatenate((self.background_mask, self.test_mask), axis=0)
+        self.all_dem = None if test_dem is None else np.concatenate((self.background_dem, self.test_dem), axis=0)
+
+        # Creating converted data for SHAP
+        self.background_data = self.data_prepare(ts_x=self.background_ts, dem_x=self.background_dem, start_idx=0)
+        self.test_data = self.data_prepare(ts_x=self.test_ts, dem_x=self.test_dem, start_idx=self.num_background)
+
+    def data_prepare(self, ts_x, dem_x=None, start_idx=0):
+        assert len(ts_x.shape) == 3
+        assert dem_x is None or len(dem_x.shape) == 2
+
+        total_num_features = self.num_dem_ftr + self.num_ts_ftr * self.num_window
+
+        # each synthetic SHAP feature stores the row index of the sample it comes from
+        x_ = [[i]*total_num_features for i in range(start_idx, start_idx + ts_x.shape[0])]
+
+        return np.array(x_)
+
+    def wraper_predict(self, x):
+        assert len(x.shape) == 2
+
+        dem_x, ts_x = x[:, :self.num_dem_ftr].copy(), x[:, self.num_dem_ftr:].copy()
+
+        # initializing the value of all arrays
+        ts_x_ = np.zeros((x.shape[0], self.all_ts.shape[1], self.all_ts.shape[2]))
+        mask_x_ = np.zeros_like(ts_x_)
+        dem_x_ = np.zeros_like(dem_x, dtype=float)
+        tstep = np.ones((x.shape[0], self.all_ts.shape[1], 1)) * \
+                np.reshape(np.arange(0, self.all_ts.shape[1]), (1, self.all_ts.shape[1], 1))
+
+        # Reshaping the ts indices based on the number of time windows and features
+        ts_x = ts_x.reshape((ts_x.shape[0], self.num_window, self.num_ts_ftr))
+
+        for i in range(x.shape[0]):
+            # creating time series data
+            for t in range(self.num_ts_step):
+                for j in range(self.num_ts_ftr):
+                    # Finding the corresponding time interval
+                    wind_t = np.ceil((t+1)/self.window_len).astype('int') - 1
+                    ind = ts_x[i, wind_t, j]
+                    ts_x_[i, t, j] = self.all_ts[ind, t, j]
+                    mask_x_[i, t, j] = None if self.all_mask is None else self.all_mask[ind, t, j]
+            # creating static data
+            for j in range(dem_x.shape[1]):
+                ind = dem_x[i,j]
+                dem_x_[i, j] = None if self.all_dem is None else self.all_dem[ind, j]
+
+        # Creating the input of the model based on the different models.
+        # This part should be updated as new models are added to the project
+        if self.model_type == 'lstm_dem':
+            model_input = [ts_x_, dem_x_]
+        elif self.model_type == 'grud':
+            model_input = [ts_x_, mask_x_, tstep]
+        elif self.model_type == 'lstm':
+            model_input = ts_x_
+
+        return self.model.predict(model_input)
+
+    def shap_values(self, num_output=1):
+        self.explainer = shap.KernelExplainer(self.wraper_predict, self.background_data)
+        shap_values = self.explainer.shap_values(self.test_data)
+        shap_values = np.array(shap_values)
+
+        self.dem_phi = shap_values[:, :, :self.num_dem_ftr]
+        ts_shap_values = shap_values[:, :, self.num_dem_ftr:]
+        self.ts_phi = ts_shap_values.reshape((num_output, self.num_test, self.num_window, self.num_ts_ftr))
+
+        # assign values to each single time step by dividing the values by the window length
+        self.ts_phi = np.repeat(self.ts_phi/self.window_len, self.window_len, axis=2)[:,:,:self.num_ts_step,:]
+
+        # Reporting only the first output
+        self.ts_phi = self.ts_phi[0]
+        self.dem_phi = self.dem_phi[0]
+
+        return self.ts_phi if self.num_dem_ftr==0 else (self.dem_phi, self.ts_phi)
+
+
+class SlidingWindowSHAP():
+    '''
+    A class for computing the Shapley values of time series data. Only the SHAP values for the first
+    output are reported.
+
+    Parameters:
+    model: A model object used for prediction. It must expose a predict() method that produces the model output for a given input
+    stride: The stride parameter of the Sliding WindowSHAP algorithm
+    window_len: The length of the time window used by the algorithm
+    B_ts: A 3D numpy array of background time series data
+    test_ts: A 3D numpy array of test time series data
+    B_mask: A 3D numpy array of background masking data. It is only used for specific models, such as GRU-D, where a masking variable is passed to the model alongside the time series data. (default: None)
+    B_dem: A 2D numpy array of background demographic data (non-temporal data). It is only used for models with both temporal and non-temporal modalities. (default: None)
+    test_mask: A 3D numpy array of test mask data (default: None)
+    test_dem: A 2D numpy array of test demographic data (default: None)
+    model_type: The type of model being used. Set the parameter to 'lstm' when time series data is the only input, to 'lstm_dem' when the input includes both time series and demographic (non-temporal) data, and to 'grud' when you are using a GRU-D structure.
(default: 'lstm') + ''' + def __init__(self, model, stride, window_len, B_ts, test_ts, B_mask=None, + B_dem=None, test_mask=None, test_dem=None, model_type='lstm'): + self.model = model + self.model_type = model_type + self.stride = stride + self.window_len = window_len + self.num_window = 2 #Specific to the sliding time window + self.num_background = len(B_ts) + self.num_test = len(test_ts) + self.background_ts = B_ts + self.background_mask = B_mask + self.background_dem = B_dem + self.test_ts = test_ts + self.test_mask = test_mask + self.test_dem = test_dem + self.ts_phi = None + self.dem_phi = None + self.explainer = None + + # Problem sizes + self.num_ts_ftr = B_ts.shape[2] + self.num_ts_step = B_ts.shape[1] + self.num_dem_ftr = 0 if B_dem is None else B_dem.shape[1] + + + # Creating all data (background and test together) + self.all_ts = np.concatenate((self.background_ts, self.test_ts), axis=0) + self.all_mask = None if test_mask is None else np.concatenate((self.background_mask, self.test_mask), axis=0) + self.all_dem = None if test_dem is None else np.concatenate((self.background_dem, self.test_dem), axis=0) + + # Creating converted data for SHAP + self.background_data = self.data_prepare(ts_x=self.background_ts, dem_x=self.background_dem, start_idx=0) + self.test_data = self.data_prepare(ts_x=self.test_ts, dem_x=self.test_dem, start_idx=self.num_background) + + def data_prepare(self, ts_x, dem_x=None, start_idx=0): + # Modified for sliding time window + assert len(ts_x.shape) == 3 + assert dem_x is None or len(dem_x.shape) == 2 + + total_num_features = self.num_dem_ftr + self.num_ts_ftr * self.num_window + + x_ = [[i] * total_num_features for i in range(start_idx, start_idx + ts_x.shape[0])] + + return np.array(x_) + + def wraper_predict(self, x, start_ind=0): + assert len(x.shape) == 2 + + # Calculating the indices inside the time window + inside_ind = list(range(start_ind, start_ind + self.window_len)) + + dem_x, ts_x = x[:, :self.num_dem_ftr].copy(), x[:, self.num_dem_ftr:].copy() + + # initializing the value of all arrays + ts_x_ = np.zeros((x.shape[0], self.num_ts_step, self.num_ts_ftr)) + mask_x_ = np.zeros_like(ts_x_) + dem_x_ = np.zeros_like(dem_x, dtype=float) + tstep = np.ones((x.shape[0], self.num_ts_step, 1)) * \ + np.reshape(np.arange(0, self.num_ts_step), (1, self.num_ts_step, 1)) + + # Reshaping the ts indices based on the num time windows and features + ts_x = ts_x.reshape((ts_x.shape[0], self.num_window, self.num_ts_ftr)) + + for i in range(x.shape[0]): + # creating time series data + for t in range(self.num_ts_step): + for j in range(self.num_ts_ftr): + # Finding the corresponding time interval + wind_t = 0 if (t in inside_ind) else 1 + ind = ts_x[i, wind_t, j] + ts_x_[i, t, j] = self.all_ts[ind, t, j] + mask_x_[i, t, j] = None if self.all_mask is None else self.all_mask[ind, t, j] + # creating static data + for j in range(dem_x.shape[1]): + ind = dem_x[i,j] + dem_x_[i, j] = None if self.all_dem is None else self.all_dem[ind, j] + + # Creating the input of the model based on the different models. 
+        # This part should be updated as new models are added to the project
+        if self.model_type == 'lstm_dem':
+            model_input = [ts_x_, dem_x_]
+        elif self.model_type == 'grud':
+            model_input = [ts_x_, mask_x_, tstep]
+        elif self.model_type == 'lstm':
+            model_input = ts_x_
+
+        return self.model.predict(model_input)
+
+    def shap_values(self, num_output=1, nsamples='auto'):
+        # Initializing the number of time windows and the contribution score matrices
+        seq_len = self.background_ts.shape[1]
+        num_sw = np.ceil((seq_len - self.window_len)/self.stride).astype('int') + 1
+        ts_phi = np.zeros((self.num_test, num_sw, 2, self.background_ts.shape[2]))
+        dem_phi = np.zeros((self.num_test, num_sw, self.num_dem_ftr))
+
+        # Determining the number of samples
+        if nsamples=='auto':
+            nsamples = 10 * self.num_ts_ftr + 5 * self.num_dem_ftr
+
+        # Main loop over the possible windows
+        for stride_cnt in range(num_sw):
+
+            predict = lambda x: self.wraper_predict(x, start_ind=stride_cnt * self.stride)
+
+            # Running SHAP
+            self.explainer = shap.KernelExplainer(predict, self.background_data)
+            shap_values = self.explainer.shap_values(self.test_data, nsamples=nsamples)
+            shap_values = np.array(shap_values)
+
+            # Extracting the SHAP values and storing them
+            dem_shap_values_ = shap_values[:, :, :self.num_dem_ftr]
+            ts_shap_values = shap_values[:, :, self.num_dem_ftr:]
+            ts_shap_values = ts_shap_values.reshape((num_output, self.num_test, 2, self.num_ts_ftr))
+
+            ts_phi[:, stride_cnt, :, :] = ts_shap_values[0]
+            dem_phi[:, stride_cnt, :] = dem_shap_values_[0]
+
+        # Averaging the SHAP values obtained from the different windows
+        ts_phi_agg = np.empty((self.num_test, num_sw, self.num_ts_step, self.num_ts_ftr))
+        ts_phi_agg[:] = np.nan
+        for k in range(num_sw):
+            ts_phi_agg[:,k, k * self.stride:k * self.stride + self.window_len, :] = ts_phi[:, k, 0, :][:, np.newaxis, :]
+        ts_phi_agg = np.nanmean(ts_phi_agg, axis=1)
+        dem_phi = np.nanmean(dem_phi, axis=1)
+
+        self.dem_phi = dem_phi
+        self.ts_phi = ts_phi_agg
+
+        return ts_phi_agg if self.num_dem_ftr==0 else (dem_phi, ts_phi_agg)
+
+
+class DynamicWindowSHAP():
+    '''
+    A class for computing the Shapley values of time series data. Only the SHAP values for the first
+    output are reported.
+
+    Parameters:
+    model: A model object used for prediction. It must expose a predict() method that produces the model output for a given input
+    delta: The threshold value used by the Dynamic WindowSHAP algorithm
+    n_w: The maximum allowed number of time windows for each variable
+    B_ts: A 3D numpy array of background time series data
+    test_ts: A 3D numpy array of test time series data
+    B_mask: A 3D numpy array of background masking data. It is only used for specific models, such as GRU-D, where a masking variable is passed to the model alongside the time series data. (default: None)
+    B_dem: A 2D numpy array of background demographic data (non-temporal data). It is only used for models with both temporal and non-temporal modalities. (default: None)
+    test_mask: A 3D numpy array of test mask data (default: None)
+    test_dem: A 2D numpy array of test demographic data (default: None)
+    model_type: The type of model being used. Set the parameter to 'lstm' when time series data is the only input, to 'lstm_dem' when the input includes both time series and demographic (non-temporal) data, and to 'grud' when you are using a GRU-D structure.
(default: 'lstm') + ''' + def __init__(self, model, delta, n_w, B_ts, test_ts, B_mask=None, B_dem=None, + test_mask=None, test_dem=None, model_type='lstm'): + self.model = model + self.model_type = model_type + self.num_background = len(B_ts) + self.num_test = len(test_ts) + self.background_ts = B_ts + self.background_mask = B_mask + self.background_dem = B_dem + self.test_ts = test_ts + self.test_mask = test_mask + self.test_dem = test_dem + self.ts_phi = None + self.dem_phi = None + self.explainer = None + + # Problem sizes + self.num_ts_ftr = B_ts.shape[2] + self.num_ts_step = B_ts.shape[1] + self.num_dem_ftr = 0 if B_dem is None else B_dem.shape[1] + + ## Specific to Binary Time Window + assert self.num_test == 1 # For binary time window algorithm, samples should be fed to the algorithm one-by-one + self.delta = delta + self.n_w = n_w + self.split_points = [[self.num_ts_step - 1]] * self.num_ts_ftr # Splitting points + self.num_window = [1] * self.num_ts_ftr + + + # Creating all data (background and test together) + self.all_ts = np.concatenate((self.background_ts, self.test_ts), axis=0) + self.all_mask = None if test_mask is None else np.concatenate((self.background_mask, self.test_mask), axis=0) + self.all_dem = None if test_dem is None else np.concatenate((self.background_dem, self.test_dem), axis=0) + + # Creating converted data for SHAP + self.background_data = self.data_prepare(ts_x=self.background_ts, dem_x=self.background_dem, start_idx=0) + self.test_data = self.data_prepare(ts_x=self.test_ts, dem_x=self.test_dem, start_idx=self.num_background) + + def data_prepare(self, ts_x, dem_x=None, start_idx=0): + assert len(ts_x.shape) == 3 + assert dem_x is None or len(dem_x.shape) == 2 + total_num_features = self.num_dem_ftr + sum(self.num_window) ## Specific to Binary Time Window + + x_ = [[i] * total_num_features for i in range(start_idx, start_idx + ts_x.shape[0])] + + return np.array(x_) + + def wraper_predict(self, x): + assert len(x.shape) == 2 + + dem_x, ts_x = x[:, :self.num_dem_ftr].copy(), x[:, self.num_dem_ftr:].copy() + + # initializing the value of all arrays + ts_x_ = np.zeros((x.shape[0], self.num_ts_step, self.num_ts_ftr)) + mask_x_ = np.zeros_like(ts_x_) + dem_x_ = np.zeros_like(dem_x, dtype=float) + tstep = np.ones((x.shape[0], self.num_ts_step, 1)) * \ + np.reshape(np.arange(0, self.num_ts_step), (1, self.num_ts_step, 1)) + + # Reshaping the ts indices based on the time windows for each feature + ## Specific to Binary Time Window + temp_ts_x = np.zeros((ts_x.shape[0], max(self.num_window), self.num_ts_ftr), dtype=int) + for i in range(self.num_ts_ftr): + temp_ts_x[:, :self.num_window[i], i] = ts_x[:, sum(self.num_window[:i]):sum(self.num_window[:i+1])] + ts_x = temp_ts_x + + for i in range(x.shape[0]): + # creating time series data + for j in range(self.num_ts_ftr): + # Finding the corresponding time interval + wind_t = np.searchsorted(self.split_points[j], np.arange(self.num_ts_step)) ## Specific to Binary Time Window + for t in range(self.num_ts_step): + ind = ts_x[i, wind_t[t], j] + ts_x_[i, t, j] = self.all_ts[ind, t, j] + mask_x_[i, t, j] = None if self.all_mask is None else self.all_mask[ind, t, j] + # creating static data + for j in range(dem_x.shape[1]): + ind = dem_x[i,j] + dem_x_[i, j] = None if self.all_dem is None else self.all_dem[ind, j] + + # Creating the input of the model based on the different models. 
+        # This part should be updated as new models are added to the project
+        if self.model_type == 'lstm_dem':
+            model_input = [ts_x_, dem_x_]
+        elif self.model_type == 'grud':
+            model_input = [ts_x_, mask_x_, tstep]
+        elif self.model_type == 'lstm':
+            model_input = ts_x_
+
+        return self.model.predict(model_input)
+
+    def shap_values(self, num_output=1, nsamples_in_loop='auto', nsamples_final='auto'):
+        flag = 1
+        while flag:
+            flag = 0
+            # Updating the number of time windows for each time series feature
+            self.num_window = [len(self.split_points[i]) for i in range(self.num_ts_ftr)]
+
+            # Updating the converted data for SHAP
+            self.background_data = self.data_prepare(ts_x=self.background_ts, dem_x=self.background_dem, start_idx=0)
+            self.test_data = self.data_prepare(ts_x=self.test_ts, dem_x=self.test_dem, start_idx=self.num_background)
+
+            # Running SHAP
+            if nsamples_in_loop == 'auto':
+                nsamples = 2 * sum(self.num_window)
+            else:
+                nsamples = nsamples_in_loop
+
+            self.explainer = shap.KernelExplainer(self.wraper_predict, self.background_data)
+            shap_values = self.explainer.shap_values(self.test_data, nsamples=nsamples)
+            shap_values = np.array(shap_values)
+            dem_phi = shap_values[0, :, :self.num_dem_ftr]  # Extracting dem SHAP values
+            ts_shap_values = shap_values[:, :, self.num_dem_ftr:]  # Extracting ts SHAP values
+
+            # Checking the maximum number of windows condition
+            if max(self.num_window) >= self.n_w:
+                break
+
+            # Splitting every window whose absolute SHAP value exceeds delta
+            for i in range(self.num_ts_ftr):
+                S = set(self.split_points[i])
+                for j in range(self.num_window[i]):
+                    if abs(ts_shap_values[0, 0, sum(self.num_window[:i]) + j]) > self.delta:
+                        S.add(int(self.split_points[i][j] / 2) if j == 0 else int((self.split_points[i][j-1] + self.split_points[i][j]) / 2))
+                if S != set(self.split_points[i]):
+                    flag += 1
+                    self.split_points[i] = list(S)
+                    self.split_points[i].sort()
+
+        # Running SHAP with a large number of samples for the final evaluation of the Shapley values
+        self.explainer = shap.KernelExplainer(self.wraper_predict, self.background_data)
+        shap_values = self.explainer.shap_values(self.test_data, nsamples=nsamples_final)
+        shap_values = np.array(shap_values)
+        dem_phi = shap_values[0, :, :self.num_dem_ftr]  # Extracting dem SHAP values
+        ts_shap_values = shap_values[:, :, self.num_dem_ftr:]  # Extracting ts SHAP values
+
+        # Assigning SHAP values to each single time step
+        ts_phi = np.zeros((self.num_test, self.num_ts_step, self.num_ts_ftr))
+        for i in range(self.num_ts_ftr):
+            for j in range(self.num_window[i]):
+                # Each splitting point belongs to the time window that starts from it;
+                # for the last time window, both its start and end splitting points belong to it
+                start_ind = 0 if j == 0 else self.split_points[i][j-1]
+                end_ind = self.split_points[i][j] + int((j + 1) / self.num_window[i])
+                ts_phi[0, start_ind:end_ind, i] = ts_shap_values[0, :, sum(self.num_window[:i]) + j] / (end_ind - start_ind)
+        self.dem_phi = dem_phi
+        self.ts_phi = ts_phi
+
+        return ts_phi if self.num_dem_ftr == 0 else (dem_phi, ts_phi)
+
+
+if __name__ == "__main__":
+    pass
diff --git a/polyadic_preprocessing/learn_from_cpp_csvs.py b/polyadic_preprocessing/learn_from_cpp_csvs.py
deleted file mode 100644
index 3f28abfd..00000000
--- a/polyadic_preprocessing/learn_from_cpp_csvs.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-import sys
-
-import pandas
-from sklearn.metrics import accuracy_score
-from sklearn.model_selection import train_test_split
-from sklearn.tree import DecisionTreeClassifier
-
-from utils import export_text2
-
-class0 = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/removed_non_rilevazione/output_csv_0.csv"
-class1 = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/removed_non_rilevazione/output_csv_1.csv"
-spec = None#"/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/log_weekly.json_0.2_0_0_0_clazz=1.txt"
-modelfile = "decl_nodata_100model_1-1.txt"
-
-def loadDataset(class0, class1):
-    df0 = pandas.read_csv(class0, index_col=0, header=None).transpose()
-    df0['class'] = 0
-    df1 = pandas.read_csv(class1, index_col=0, header=None).transpose()
-    df1['class'] = 1
-    return pandas.concat([df0, df1], axis=0, ignore_index=True).fillna(-1)
-
-def readFileForSpec(filename):
-    S = set()
-    with open(filename, "r") as f:
-        for line in f.readlines():
-            S.add(line)
-            coex = line.find("CoExistence(")
-            cho = line.find("Choice(")
-            excl = line.find("ExclChoice(")
-            firstOpen = line.find('(')
-            lastClose = line.rfind(')')
-            if (coex==0) or cho==0 or excl==0:
-                comma = line.find(',')
-                comma2 = line.rfind(',')
-                if (comma == comma2):
-                    act1 = line[firstOpen + 1:comma].strip()
-                    act2 = line[comma + 1:lastClose].strip()
-                    if (coex==0):
-                        S.add("CoExistence("+act2+","+act1+")")
-                    elif (cho==0):
-                        S.add("Choice("+act2+","+act1+")")
-                    elif (excl==1):
-                        S.add("ExclChoice("+act2+","+act1+")")
-                    else:
-                        sys.exit(1)
-    return S
-
-dict_list = loadDataset(class0, class1)
-
-if spec is not None:
-    S = readFileForSpec(spec)
-    dict_list = dict_list[list(set(dict_list.columns).intersection(S))+["class"]]
-
-
-
-if dict_list.empty or (len(set(dict_list.columns)) == 1 and ("class" in set(dict_list.columns))):
-    print("No data")
-    with open(modelfile, "w") as file:
-        file.write("No data")
-else:
-    X = dict_list.drop(labels=['class'], axis=1)
-    y = dict_list['class']
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
-    rf = DecisionTreeClassifier(criterion="entropy")
-    rf.fit(X_train, y_train)
-    y_pred = rf.predict(X_test)
-    accuracy = accuracy_score(y_test, y_pred)
-    print("Accuracy:", accuracy)
-    with open(modelfile, "w") as file:
-        file.write(os.linesep.join(export_text2(rf, X.columns, show_weights=True)))
-        file.write(os.linesep + ("Accuracy: ") + str(accuracy))
\ No newline at end of file
diff --git a/polyadic_preprocessing/log_json_to_numpy.py b/polyadic_preprocessing/log_json_to_numpy.py
new file mode 100644
index 00000000..7694e8a4
--- /dev/null
+++ b/polyadic_preprocessing/log_json_to_numpy.py
@@ -0,0 +1,145 @@
+import json
+import os
+import sys
+from collections import defaultdict
+
+import numpy
+import pandas
+from sklearn.metrics import accuracy_score, precision_score
+from sklearn.model_selection import train_test_split
+
+
+def transform_entry(e, S, minsplit):
+    # Drop the columns whose names end in _a/_s/_v/_i
+    torem = list(set(filter(lambda x: x.endswith(("_a", "_s", "_v", "_i")), S)))
+    sc = set(e.columns)
+    for x in S:
+        if x not in sc:
+            e[x] = numpy.nan
+    e.drop(torem, axis=1, inplace=True)
+    e.fillna(0, inplace=True)
+    if minsplit > 0:
+        # Sliding windows; note that .loc slicing is inclusive, so each window spans minsplit + 1 rows
+        for i in range(len(e) - minsplit):
+            tmp = e.loc[i:i + minsplit, :]
+            yield {x: pandas.Series(tmp[x].to_numpy().astype(float)) for x in sorted(set(tmp.columns).difference(torem))}
+    else:
+        # No splitting: one multivariate series per segment
+        yield {x: pandas.Series(e[x].to_numpy().astype(float)) for x in sorted(set(e.columns).difference(torem))}
+
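+# A minimal usage sketch for transform_entry (hypothetical column names; assumes a
+# per-segment DataFrame such as the ones built by log_json_to_numpy below):
+#
+#     seg = pandas.DataFrame({"hr": [60.0, 62.0, 61.0], "class": [1, 1, 1]})
+#     windows = list(transform_entry(seg, S={"hr", "class"}, minsplit=2))
+#     # -> one dict per window, mapping each kept column to a float pandas.Series
+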
+def transform_user_class(v, S, minsplit):
+    for x in v:
+        yield from transform_entry(x, S, minsplit)
+
+
+def transform_users(u, S, minsplit):
+    for k, v in u.items():
+        # Label every generated window with the class key k (window splitting may yield
+        # more windows than there are segments in v)
+        windows = list(transform_user_class(v, S, minsplit))
+        yield (windows, [k] * len(windows))
+
+
+def transform_all(L, S, minsplit):
+    for u in L:
+        yield from transform_users(u, S, minsplit)
+
+
+def classifier(name):
+    # The sktime classifiers are imported lazily so that only the selected backend is loaded
+    if name == 'Rocket':
+        from sktime.classification.kernel_based import RocketClassifier
+        return RocketClassifier(num_kernels=500, use_multivariate="yes")
+    elif name == 'TapNet':
+        from sktime.classification.deep_learning.tapnet import TapNetClassifier
+        return TapNetClassifier(n_epochs=20, batch_size=4)
+    elif name == 'EuclideanKNN':
+        from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
+        return KNeighborsTimeSeriesClassifier(distance="euclidean")
+    elif name == 'CanonicalIntervalForest':
+        from sktime.classification.interval_based import CanonicalIntervalForest
+        return CanonicalIntervalForest(n_estimators=3, n_intervals=2, att_subsample_size=2)
+    elif name == 'ShapeletTransformClassifier':
+        from sktime.classification.shapelet_based import ShapeletTransformClassifier
+        return ShapeletTransformClassifier()
+    elif name == 'SignatureClassifier':
+        from sktime.classification.feature_based import SignatureClassifier
+        return SignatureClassifier()
+
+
+classifiers = ['Rocket', 'TapNet', 'EuclideanKNN', 'CanonicalIntervalForest', 'ShapeletTransformClassifier']
+
+
+def transform(L, S, minsplit):
+    yL = []
+    xL = []
+    for x, y in transform_all(L, S, minsplit):
+        for xs, ys in zip(x, y):
+            xL.append(xs)
+            yL.append(ys)
+    xL = pandas.DataFrame(xL)
+    yL = numpy.array(yL)
+    X_train, X_test, y_train, y_test = train_test_split(xL, yL, test_size=0.3, stratify=yL)
+    d = dict()
+    p = dict()
+    for name in classifiers:
+        clf = classifier(name)
+        try:
+            clf.fit(X_train, y_train)
+            y_pred = clf.predict(X_test)
+            accuracy = accuracy_score(y_test, y_pred)
+            precision = precision_score(y_test, y_pred)
+        except Exception:
+            accuracy = 0
+            precision = 0
+        print(f"{name}: Accuracy={accuracy}, Precision={precision}")
+        d[name] = accuracy
+        p[name] = precision
+    return (d, p)
+
+
+def log_json_to_numpy(filename_json, dominsplit=False):
+    if not os.path.exists(filename_json):
+        return None
+    with open(filename_json, "r") as f:
+        data = json.load(f)
+    data = data["log"]
+    S = set()
+    userL = []
+    minsplit = sys.maxsize
+    for user in data:
+        prevClass = None
+        df = defaultdict(list)
+        buildup = []
+        for event in user["__events"]:
+            payload = event[0]
+            __class = 1 if payload.pop("__class") == "Ok" else 0
+            payload.pop("__label")
+            payload.pop("day")
+            payload.pop("time")
+            payload.pop("fulltime")
+            payload["class"] = __class
+            if prevClass is None:
+                prevClass = __class
+            if prevClass == __class:
+                buildup.append(payload)
+            else:
+                # Close the current same-class segment and open a new one starting at this event
+                minsplit = min(minsplit, len(buildup))
+                df[prevClass].append(pandas.DataFrame(buildup))
+                buildup = [payload]
+                prevClass = __class
+            S = S.union(set(payload.keys()))
+        minsplit = min(minsplit, len(buildup))
+        df[prevClass].append(pandas.DataFrame(buildup))
+        userL.append(df)
+    if dominsplit is False:
+        minsplit = -1
+    return transform(userL, S, minsplit)
+
+
+if __name__ == "__main__":
+    aL = []
+    pL = []
+    for _ in range(20):
+        a, p = log_json_to_numpy("/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json",
dominsplit=True)
+        aL.append(a)
+        pL.append(p)
+    pandas.DataFrame(aL).to_csv("accuracy.csv")
+    pandas.DataFrame(pL).to_csv("precision.csv")
diff --git a/polyadic_preprocessing/medical_analysis.py b/polyadic_preprocessing/medical_analysis.py
index b542fa3c..0be40de8 100644
--- a/polyadic_preprocessing/medical_analysis.py
+++ b/polyadic_preprocessing/medical_analysis.py
@@ -23,7 +23,7 @@
 from timeseries.MultiTraceIndexing import MultiTraceIndexing
 from timeseries.SequentialPatternMining import SequentialPatternMining, MiningConfiguration
 from timeseries.TimeSeriesMining import mine_binary_growth_patterns
-from prefixspan import PrefixSpan
+# from prefixspan import PrefixSpan
 
 def extendDictionaryWithTime(d):
     t = d.log.projectProperties("time", lambda l: min(filter(lambda x : isinstance(x, str), l)))
diff --git a/polyadic_preprocessing/crawl_results_model_stats.py b/polyadic_preprocessing/stats_crawl_results_model_stats.py
similarity index 98%
rename from polyadic_preprocessing/crawl_results_model_stats.py
rename to polyadic_preprocessing/stats_crawl_results_model_stats.py
index 2333391c..3b844493 100644
--- a/polyadic_preprocessing/crawl_results_model_stats.py
+++ b/polyadic_preprocessing/stats_crawl_results_model_stats.py
@@ -4,10 +4,10 @@
 import pandas
 import math
 
-supps = set()
-reds = set()
-poly = set()
-rec = set()
+# supps = set()
+# reds = set()
+# poly = set()
+# rec = set()
 filename_fileds = ["mining_supp","reduction","isFilenamePolyadic","reclassify"]
 S = set(["Choice", "RespExistence", "Response", "ChainResponse", "Precedence", "ChainPrecedence", "CoExistence", "Succession", "ChainSuccession", "Init", "End", "Exists", "Absence", "Choice", "ExclChoice"])
 root_dir = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/"
diff --git a/polyadic_preprocessing/utils.py b/polyadic_preprocessing/utils.py
index 94417558..ffdc934a 100644
--- a/polyadic_preprocessing/utils.py
+++ b/polyadic_preprocessing/utils.py
@@ -19,6 +19,42 @@ def time_in_range(start, delta, x):
     return start <= x or x <= end
 
 
+class ForParsing:
+    """Crawls root_dir for mined-model .txt files and parses their file-name stems
+    into the metadata fields listed in filename_fileds, plus class and filename."""
+    def __init__(self, root_dir, filename_fileds=None, S=None, untimed=None):
+        self.root_dir = root_dir
+        if filename_fileds is None:
+            self.filename_fileds = ["mining_supp", "reduction", "isFilenamePolyadic", "reclassify"]
+        else:
+            self.filename_fileds = list(filename_fileds)
+        if S is None:
+            self.S = {"Choice", "RespExistence", "Response", "ChainResponse", "Precedence", "ChainPrecedence", "CoExistence",
+                      "Succession", "ChainSuccession", "Init", "End", "Exists", "Absence", "ExclChoice"}
+        else:
+            self.S = set(S)
+        if untimed is None:
+            self.untimed = {"Choice", "RespExistence", "CoExistence", "ExclChoice"}
+        else:
+            self.untimed = set(untimed)
+
+    def yielder(self, ff):
+        import glob
+        from pathlib import Path
+        # The file itself is not read here; ff receives the path and decides what to load
+        for filename in glob.iglob(self.root_dir + '**/*.txt', recursive=True):
+            stem = Path(filename).stem.split("_")
+            clazz = stem[-1].replace("clazz=", "")
+            stem = stem[:-1]
+            d = dict(zip(self.filename_fileds, stem[-len(self.filename_fileds):]))
+            d["class"] = clazz
+            d["filename"] = "_".join(stem[:-len(self.filename_fileds)])
+            key = tuple(stem[-len(self.filename_fileds):])
+            ff(filename, key, d)
+
+
 def export_text2(decision_tree, feature_names=None, spacing=3, decimals=5, show_weights=False):
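+    # Variant of sklearn.tree.export_text; judging from the call sites in this repository
+    # (e.g. export_text2(rf, X.columns, show_weights=True) in the removed learn_from_cpp_csvs.py),
+    # it returns an iterable of text lines describing the fitted tree, which callers join
+    # with os.linesep before writing to disk.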