From e7fa53dee6e7b7923743492c2498551c6a44c955 Mon Sep 17 00:00:00 2001 From: Giacomo Bergami Date: Wed, 19 Jun 2024 21:51:34 +0100 Subject: [PATCH] Latest version --- benchmark_model_size.csv | 74 --- benchmark_poly.csv | 10 + expand_elements.sh | 3 + .../knobab/mining/polyadic/polyadic_bolt.h | 57 ++- new_pipeline.cpp | 1 + poly_bench.sh | 6 +- .../02_crawl_single_model.py | 196 ++++++++ .../04_learn_from_cpp_csvs.py | 126 +++++ .../__pycache__/utils.cpython-310.pyc | Bin 0 -> 5923 bytes polyadic_preprocessing/crawl_single_model.py | 59 --- polyadic_preprocessing/external/__init__.py | 0 .../external/rocket_training.py | 9 + .../external/windowshap/README.md | 17 + .../external/windowshap/__init__.py | 0 .../external/windowshap/windowshap.py | 444 ++++++++++++++++++ polyadic_preprocessing/learn_from_cpp_csvs.py | 72 --- polyadic_preprocessing/log_json_to_numpy.py | 145 ++++++ polyadic_preprocessing/medical_analysis.py | 2 +- ....py => stats_crawl_results_model_stats.py} | 8 +- polyadic_preprocessing/utils.py | 36 ++ 20 files changed, 1038 insertions(+), 227 deletions(-) delete mode 100644 benchmark_model_size.csv create mode 100755 expand_elements.sh create mode 100644 polyadic_preprocessing/02_crawl_single_model.py create mode 100644 polyadic_preprocessing/04_learn_from_cpp_csvs.py create mode 100644 polyadic_preprocessing/__pycache__/utils.cpython-310.pyc delete mode 100644 polyadic_preprocessing/crawl_single_model.py create mode 100644 polyadic_preprocessing/external/__init__.py create mode 100644 polyadic_preprocessing/external/rocket_training.py create mode 100644 polyadic_preprocessing/external/windowshap/README.md create mode 100644 polyadic_preprocessing/external/windowshap/__init__.py create mode 100644 polyadic_preprocessing/external/windowshap/windowshap.py delete mode 100644 polyadic_preprocessing/learn_from_cpp_csvs.py create mode 100644 polyadic_preprocessing/log_json_to_numpy.py rename polyadic_preprocessing/{crawl_results_model_stats.py => stats_crawl_results_model_stats.py} (98%) diff --git a/benchmark_model_size.csv b/benchmark_model_size.csv deleted file mode 100644 index 77c5db0c..00000000 --- a/benchmark_model_size.csv +++ /dev/null @@ -1,74 +0,0 @@ -filename_polyadic,mining_supp,reduction,reclassify,isFilenamePolyadic,cpp_preprocess,loading,indexing,mining,refining,0,1 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,1,1099.4,1689.05,246.804,7251.89,114.59,9439,479 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,1,1141.24,2007.94,253.875,8002.66,0,533732,75820 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,0,1219.53,2004.85,359.461,9408.3,122.549,7610,173 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,0,1133.01,1758.15,259.612,7604.47,0,530206,75139 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,1,1120.33,1761.42,273.884,3126.8,62.5259,6498,478 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,1,1157.08,1886.24,260.621,3512.22,0,315106,32029 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,0,1222.29,1920,284.128,3396.46,61.2653,6756,172 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,0,1273.18,1950.75,305.145,3399.54,0,314490,31583 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,1,1252.64,1869.1,266.854,1477.86,33.4806,5702,478 
-/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,1,1219.19,1855.42,302.62,1503.69,0,147768,11695 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,0,1190.85,1976.92,331.079,1648.09,41.9786,6061,172 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,0,1238.49,1853.81,279.515,1431.37,0,147540,11302 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,1,1280.93,1869.98,366.512,322.316,11.0774,5487,478 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,1,1137.45,1754.38,264.683,284.171,0,25538,3404 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,0,1152.76,1773.71,262.786,286.108,10.2654,5836,172 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,0,1136.65,1870.08,248.834,285.844,0,25742,3149 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,1,1123.92,1738.98,247.793,186.474,8.96673,0,2764 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,1,1198.62,1837.25,262.961,202,0,298,52030 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,0,1107.49,1719.93,244.682,186.376,8.39929,0,1959 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,0,1185.12,1809.28,244.711,196.961,0,298,52637 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,1,1119.2,1725.64,237.594,127.964,7.24247,5005,478 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,1,1130.31,1735.11,245.407,122.864,0,6953,2484 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,0,1149.12,1760.43,241.127,129.149,7.1985,5336,172 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,0,1079.38,1709.02,241.849,117.925,0,7180,2211 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,1,1301.58,2163.63,368.928,222.438,0,298,52217 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,1,1118.84,1741.4,285.376,7746.16,113.068,7103,154 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,1,1144.55,1764.23,270.079,8023.76,0,525574,73332 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,0,1161.52,1789.42,270.583,9020.05,130.568,7382,154 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,0,1195.64,1849.54,303.424,8893.18,0,521259,72394 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,1,1175.35,1861.26,295.849,3406.37,63.2159,6499,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,1,1205.8,1939.56,262.098,3471.62,0,312047,31103 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,0,1155.33,1761.74,257.97,3011.85,58.2738,6335,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,0,1207.61,1853.37,253.524,3168.34,0,310971,30643 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,1,1149.51,1767.3,248.628,1354.25,32.6771,5789,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,1,1186.56,1828.07,255.645,1435.23,0,144941,11084 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,0,1193.54,1817.34,249.68,1420.82,32.9605,5624,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,0,1173.8,1822.78,265.152,1430.22,0,144124,10634 
-/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,1,1167.32,1773.89,248.039,273.39,9.6823,5573,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,1,1156.8,1888.85,330.842,293.511,0,23294,2417 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,0,1211.65,2053.25,350.924,300.892,10.3833,5398,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,0,1217.38,1852.89,269.338,272.204,0,22758,2142 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,1,1152.48,1783.94,273.103,191.392,8.47948,0,2233 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,1,1147.74,1806.4,268.174,197.1,0,298,52217 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,0,1132.37,1769.65,284.116,193.452,10.5667,0,2499 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,0,1231.13,1882.52,269.057,199.144,0,298,51088 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,1,1164.66,1767.48,270.478,125.169,6.51384,5076,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,1,1128.52,1771.37,348.037,121.904,0,5967,1732 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,0,1245.87,1900.8,284.205,123.48,6.64354,4904,153 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,0,1190.53,1814.45,265.941,108.35,0,5412,1466 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,1,369.196,1130.89,240.681,5347.68,79.1541,4897,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,1,370.103,1110.48,217.76,5461.91,0,307553,53479 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,1,0,0,390.888,1126.82,213.156,5326.23,84.1707,5061,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.2,0,0,0,364.473,1068.24,199.103,5507.18,0,305736,52997 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,1,395.096,1154.67,178.822,2235.38,48.929,4636,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,1,335.157,994.656,132.263,1932.11,0,201911,20528 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,1,0,0,325.114,942.691,133.291,1916.34,44.3508,4679,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.4,0,0,0,353.597,979.619,150.706,2078.34,0,201518,20423 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,1,350.133,980.558,141.927,1351.27,33.3041,3855,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,1,416.959,1090.24,187.732,1172.45,0,99282,6374 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,1,0,0,343.394,1018.68,173.083,1001.95,23.2435,3842,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.6,0,0,0,348.539,1007.23,156.05,1076.86,0,98849,6236 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,1,369.17,1025.38,196.532,188.573,6.35762,3406,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,1,315.176,970.604,197.169,193.359,0,17077,1180 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,1,0,0,331.102,989.719,165.18,176.651,7.10374,3399,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.8,0,0,0,345.19,1170.48,287.932,206.732,0,16972,1130 
-/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,1,360.264,1124.02,205.661,100.691,3.00067,0,744 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,1,333.938,993.624,167.747,98.4587,0,200,22686 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,1,0,0,342.206,997.067,157.278,97.1346,3.165,0,785 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,1,0,0,0,337.03,1026.49,179.884,97.0274,0,200,22547 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,1,325.013,979.388,165.127,64.723,4.23475,3135,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,1,356.321,1007.08,131.226,56.9022,0,3447,795 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,0,328,911.844,128.183,57.1585,4.14329,3114,82 -/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,0,319.621,903.721,130.917,53.0961,0,3331,753 diff --git a/benchmark_poly.csv b/benchmark_poly.csv index 107e49ae..3602517b 100644 --- a/benchmark_poly.csv +++ b/benchmark_poly.csv @@ -72,3 +72,13 @@ filename_polyadic,mining_supp,reduction,reclassify,isFilenamePolyadic,cpp_prepro /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,1,356.321,1007.08,131.226,56.9022,0 /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,1,0,0,328,911.844,128.183,57.1585,4.14329 /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0.9,0,0,0,319.621,903.721,130.917,53.0961,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,300.86,800.527,117.078,20082.6,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,328.465,840.729,140.021,23425,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,293.664,774.624,116.497,19526.1,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,2886.19,9413,813.953,106920,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,1,0,1,2830.63,9298.7,568.097,110897,595.166 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,1,0,1,305.666,884.449,135.671,22015.3,175.412 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,1,0,1,331.926,843.255,136.441,21827.9,247.775 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,1,322.554,931.787,212.557,22430.9,0 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,1,0,0,308.326,856.088,150.196,23294.9,172.628 +/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json,0,0,0,0,303.417,828.267,130.524,22808.3,0 diff --git a/expand_elements.sh b/expand_elements.sh new file mode 100755 index 00000000..31ec4bfe --- /dev/null +++ b/expand_elements.sh @@ -0,0 +1,3 @@ +find /home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/ -maxdepth 1 -mindepth 1 -type d |while read fname; do + ./cmake-build-release/knobab_json -f "$fname" -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json; +done \ No newline at end of file diff --git a/include/knobab/mining/polyadic/polyadic_bolt.h b/include/knobab/mining/polyadic/polyadic_bolt.h index 4651d456..5cb8cd36 100644 --- a/include/knobab/mining/polyadic/polyadic_bolt.h +++ b/include/knobab/mining/polyadic/polyadic_bolt.h @@ -107,6 +107,11 @@ struct result_container { if ((val == 1) || (val == -2)) val = 0; } + 
for (size_t i = 0; i& l, const pattern_mining_result& r) { return std::tie(l.clause.casusu, l.clause.left, l.clause.right, l.clause.n) == std::tie(r.clause.casusu, r.clause.left, r.clause.right, r.clause.n); }), Phi.end()); - DEBUG_ASSERT(curr_size_Clauses == Phi.size()); +// DEBUG_ASSERT(curr_size_Clauses == Phi.size()); } inline void mdev(size_t i) { @@ -1011,6 +1016,7 @@ struct polyadic_bolt { } setKnowledgeBaseAndInit(ptr); std::vector actLabels; + std::unordered_set act_to_consider; std::unordered_map, std::vector> result_map; std::unordered_map> act_Labels; std::unordered_map> noact_Labels; @@ -1019,19 +1025,24 @@ struct polyadic_bolt { ssize_t trace_id = -1; size_t log_size = ptr->nTraces(); for (const auto& x : acts) { - size_t id = ptr->event_label_mapper.get(x); - actLabels.emplace_back(id); - auto a_beginend = kb->timed_dataless_exists(id); - auto& v = act_Labels[id]; - trace_id = -1; - while (a_beginend.first != a_beginend.second) { - if (trace_id != a_beginend.first->entry.id.parts.trace_id) { - trace_id = a_beginend.first->entry.id.parts.trace_id; - v.emplace_back(trace_id); + if(ptr->event_label_mapper.signed_get(x)>0) { + size_t id = ptr->event_label_mapper.get(x); + actLabels.emplace_back(id); + act_to_consider.insert(id); + auto a_beginend = kb->timed_dataless_exists(id); + auto& v = act_Labels[id]; + trace_id = -1; + while (a_beginend.first != a_beginend.second) { + if (trace_id != a_beginend.first->entry.id.parts.trace_id) { + trace_id = a_beginend.first->entry.id.parts.trace_id; + v.emplace_back(trace_id); + } + a_beginend.first++; } - a_beginend.first++; + set_complement(log_size, v.begin(), v.end(), std::back_inserter(noact_Labels[id])); + } else { +// std::cerr << x << std::endl; } - set_complement(log_size, v.begin(), v.end(), std::back_inserter(noact_Labels[id])); } // remove_duplicates(actLabels); FastDatalessClause clause; @@ -1045,16 +1056,19 @@ struct polyadic_bolt { for (size_t trace_id = 0; trace_id < log_size; trace_id++) { const auto& first_last =kb->act_table_by_act_id.secondary_index.at(trace_id); for (auto it = first_last.first->begin(), en = first_last.first->end(); it!=en; it++) { - first[it->first].emplace_back(trace_id); + if (act_to_consider.contains(it->first)) + first[it->first].emplace_back(trace_id); } for (auto it = first_last.second->begin(), en = first_last.second->end(); it!=en; it++) { - last[it->first].emplace_back(trace_id); + if (act_to_consider.contains(it->first)) + last[it->first].emplace_back(trace_id); } } std::cout << "First..." << std::endl; std::tuple simplistic_clause{"Init","","§1"}; for (const auto& [act_id, traces] : first) { all_VIOL.clear(); + if (!act_to_consider.contains(act_id)) continue; std::get<1>(simplistic_clause) = ptr->event_label_mapper.get(act_id); auto& v = result_map[simplistic_clause]; v.resize(log_size, -1); @@ -1066,6 +1080,7 @@ struct polyadic_bolt { std::cout << "Last..." << std::endl; for (const auto& [act_id, traces] : last) { all_VIOL.clear(); + if (!act_to_consider.contains(act_id)) continue; std::get<1>(simplistic_clause) = ptr->event_label_mapper.get(act_id); auto& v = result_map[simplistic_clause]; v.resize(log_size, -1); @@ -1078,10 +1093,15 @@ struct polyadic_bolt { std::get<0>(simplistic_clause) = "Exists"; std::cout << "Exists..." 
<< std::endl; for (const auto& [act_id, countings] : exists) { + if(ptr->event_label_mapper.signed_get(act_id)<0) { + continue; + } + if (!act_to_consider.contains(ptr->event_label_mapper.get(act_id))) continue; std::get<1>(simplistic_clause) = act_id; auto indexes = ptr->resolveCountingData(act_id); if ((indexes.first == indexes.second) && (indexes.first == (uint32_t)-1)) { - exit(5); + continue; +// exit(5); } else { std::unordered_map MAP; for (size_t count : countings) { @@ -1102,10 +1122,15 @@ struct polyadic_bolt { std::get<0>(simplistic_clause) = "Absence"; std::cout << "Absence..." << std::endl; for (const auto& [act_id, countings] : absence) { + if(ptr->event_label_mapper.signed_get(act_id)<0) { + continue; + } + if (!act_to_consider.contains(ptr->event_label_mapper.get(act_id))) continue; std::get<1>(simplistic_clause) = act_id; auto indexes = ptr->resolveCountingData(act_id); if ((indexes.first == indexes.second) && (indexes.first == (uint32_t)-1)) { - exit(4); + continue; +// exit(4); } else { std::unordered_map MAP; for (size_t count : countings) { diff --git a/new_pipeline.cpp b/new_pipeline.cpp index b2f002f7..1d854add 100644 --- a/new_pipeline.cpp +++ b/new_pipeline.cpp @@ -102,6 +102,7 @@ struct benchmarking { #include int main(int argc, char **argv) { + // Phases 01 (mining the models from the data) and 03 (deriving the decision tree structure) // CyberSecurity configuration: // -s 0.8 --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab --nonPoly=tab /home/giacomo/Scaricati/classes/Adware.tab_100.tab /home/giacomo/Scaricati/classes/Backdoor.tab_100.tab /home/giacomo/Scaricati/classes/Downloader.tab_100.tab /home/giacomo/Scaricati/classes/Dropper.tab_100.tab /home/giacomo/Scaricati/classes/Spyware.tab_100.tab /home/giacomo/Scaricati/classes/Trojan.tab_100.tab /home/giacomo/Scaricati/classes/Virus.tab_100.tab /home/giacomo/Scaricati/classes/Worms.tab_100.tab diff --git a/poly_bench.sh b/poly_bench.sh index dffd92c5..19a25294 100755 --- a/poly_bench.sh +++ b/poly_bench.sh @@ -9,4 +9,8 @@ done ./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -r -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json ./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json ./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -r -l -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json -./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -l -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json \ No newline at end of file +./cmake-build-release/knobab_json -s 0.9 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -l -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json +./cmake-build-release/knobab_json -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -r -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json +./cmake-build-release/knobab_json -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json +./cmake-build-release/knobab_json -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -r -l -p 
/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json
+./cmake-build-release/knobab_json -s 0.0 -d user -i day -i span -i "__class" -i "__label" -i time -i fulltime -l -p /home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json
\ No newline at end of file
diff --git a/polyadic_preprocessing/02_crawl_single_model.py b/polyadic_preprocessing/02_crawl_single_model.py
new file mode 100644
index 00000000..ab9d4b84
--- /dev/null
+++ b/polyadic_preprocessing/02_crawl_single_model.py
@@ -0,0 +1,196 @@
+import glob
+import os.path
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+from utils import ForParsing
+
+
+class ProcessClasses:
+    def __init__(self):
+        self.d_exists = dict()
+        self.d_absences = dict()
+        self.acts = set()
+        self.files = set()
+
+    def dump(self, folder, rem=None):
+        # normalise whatever we receive (str or Path) into a Path
+        path = folder if isinstance(folder, Path) else Path(folder)
+        print("Writing acts....")
+        if rem is not None:
+            self.acts.discard(rem)
+            self.d_exists.pop(rem, None)
+            self.d_absences.pop(rem, None)
+        with open(os.path.join(path.absolute(), "acts.txt"), "w") as fp:
+            fp.write(os.linesep.join(self.acts))
+        with open(os.path.join(path.absolute(), "exists.txt"), "w") as fp:
+            fp.write(os.linesep.join(
+                [" ".join([str(len(self.d_exists[act]))] + [str(x) for x in self.d_exists[act]] + [act]) for act in self.d_exists]))
+        with open(os.path.join(path.absolute(), "absences.txt"), "w") as fp:
+            fp.write(os.linesep.join(
+                [" ".join([str(len(self.d_absences[act]))] + [str(x) for x in self.d_absences[act]] + [act]) for act in self.d_absences]))
+
+    def process(self, file):
+        with open(file, "r") as f:
+            self.files.add(file)
+            for line in f.read().splitlines():
+                if line.startswith("Exists"):
+                    firstOpen = line.find('(')
+                    lastClose = line.rfind(')')
+                    lastPar = line.rfind('§')
+                    act = line[firstOpen + 1:lastPar - 1].strip()
+                    self.acts.add(act)
+                    self.d_exists.setdefault(act, set()).add(int(line[lastPar + 1:lastClose]))
+                elif line.startswith("Absence"):
+                    firstOpen = line.find('(')
+                    lastClose = line.rfind(')')
+                    lastPar = line.rfind('§')
+                    act = line[firstOpen + 1:lastPar - 1].strip()
+                    self.acts.add(act)
+                    self.d_absences.setdefault(act, set()).add(int(line[lastPar + 1:lastClose]))
+                elif line.startswith("Init") or line.startswith("End"):
+                    firstOpen = line.find('(')
+                    lastPar = line.rfind('§')
+                    act = line[firstOpen + 1:lastPar - 1].strip()
+                    self.acts.add(act)
+                else:
+                    # binary clause, e.g. Response(a,b): collect both activity labels
+                    firstOpen = line.find('(')
+                    lastClose = line.rfind(')')
+                    comma = line.find(',')
+                    comma2 = line.rfind(',')
+                    if comma == comma2:
+                        act = line[firstOpen + 1:comma].strip()
+                        self.acts.add(act)
+                        act = line[comma + 1:lastClose].strip()
+                        self.acts.add(act)
+                    else:
+                        print("PARSING ERROR")
+                        sys.exit(1)
+
+
+file = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/removed_non_rilevazione/log_weekly.json_1_1_1_0_clazz=1.txt"
+
+def old():
+    pc = ProcessClasses()
+    pc.process(file)
+    pc.dump(Path(file).parent.absolute())
+
+
+def neu():
+    # exploratory variant, superseded by the yielder-based driver below
+    filename_fields = ["mining_supp", "reduction", "isFilenamePolyadic", "reclassify"]
+    S = set(["Choice", "RespExistence", "Response", "ChainResponse", "Precedence", "ChainPrecedence", "CoExistence",
+             "Succession", "ChainSuccession", "Init", "End", "Exists",
"Absence", "Choice", "ExclChoice"]) + root_dir = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/" + untimed = set(["Choice", "RespExistence", "CoExistence", "Choice", "ExclChoice"]) + timed = S.difference(untimed) + # desirGlobal = dict() + + # root_dir needs a trailing slash (i.e. /root/dir/) + L = [] + L2 = [] + for filename in glob.iglob(root_dir + '**/*.txt', recursive=True): + with open(filename, 'r') as f: + stem = Path(filename).stem.split("_") + clazz = stem[-1].replace("clazz=", "") + stem = stem[:-1] + d = dict(zip(filename_fileds, stem[-len(filename_fileds):])) + dtmp = dict() + d["class"] = clazz + d["filename"] = "_".join(stem[:-len(filename_fileds)]) + key = tuple(stem[-len(filename_fileds):]) + if "isFilenamePolyadic" in d: + d.pop("isFilenamePolyadic") + d.pop("reclassify") + d.pop("class") + d.pop("filename") + for i in range(3): + minsupp_reduction_conf[d["mining_supp"]][d["reduction"]][i] = "ciao" + + +minsupp_reduction_conf = defaultdict(lambda : defaultdict(lambda : defaultdict(ProcessClasses))) +def yi(file, key, d): + if "isFilenamePolyadic" in d: + d.pop("reclassify") + d.pop("class") + d.pop("filename") + minsupp_reduction_conf[d["isFilenamePolyadic"]][d["mining_supp"]][d["reduction"]].process(file) + +path = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/" +fp = ForParsing(path) +fp.yielder(yi) +for poly, inner in minsupp_reduction_conf.items(): + NP = "nopoly" if (int(poly)==0 or (not poly)) else "poly" + for supp, values in inner.items(): + for red, obj in values.items(): + p = Path(os.path.join(path, f"{NP}_s{supp}_{red}")) + p.mkdir(parents=True, exist_ok=True) + obj.dump(p, "__missing") + +# neu() +# print(minsupp_reduction_conf) +# with open(file, "r") as f: +# for line in f.read().splitlines(): +# if line.startswith("Exists"): +# firstOpen = line.find('(') +# lastClose = line.rfind(')') +# lastPar = line.rfind('§') +# act = line[firstOpen + 1:lastPar - 1].strip() +# acts.add(act) +# if act not in d_exists: +# d_exists[act] = set() +# d_exists[act].add(int(line[lastPar+1:lastClose])) +# elif line.startswith("Absence"): +# firstOpen = line.find('(') +# lastClose = line.rfind(')') +# lastPar = line.rfind('§') +# act = line[firstOpen + 1:lastPar - 1].strip() +# acts.add(act) +# if act not in d_absences: +# d_absences[act] = set() +# d_absences[act].add(int(line[lastPar+1:lastClose])) +# elif line.startswith("Init") or line.startswith("End"): +# firstOpen = line.find('(') +# lastClose = line.rfind(')') +# lastPar = line.rfind('§') +# act = line[firstOpen + 1:lastPar - 1].strip() +# acts.add(act) +# else: +# firstOpen = line.find('(') +# lastClose = line.rfind(')') +# comma = line.find(',') +# comma2 = line.rfind(',') +# if (comma == comma2): +# act = line[firstOpen + 1:comma].strip() +# acts.add(act) +# act = line[comma +1:lastClose].strip() +# acts.add(act) +# else: +# sys.exit(1) +# +# +# path = Path(file) +# print("Writing acts....") +# with open(os.path.join(path.parent.absolute(), "acts.txt"), "w") as fp: +# fp.write(os.linesep.join(acts)) +# +# with open(os.path.join(path.parent.absolute(), "exists.txt"), "w") as fp: +# fp.write(os.linesep.join([" ".join([str(len(d_exists[act]))]+[str(x) for x in d_exists[act]]+[act]) for act in d_exists])) +# +# with open(os.path.join(path.parent.absolute(), "absences.txt"), "w") as fp: +# fp.write(os.linesep.join([" ".join([str(len(d_exists[act]))]+[str(x) for x in d_exists[act]]+[act]) for act in d_absences])) \ No newline at end of file diff --git 
diff --git a/polyadic_preprocessing/04_learn_from_cpp_csvs.py b/polyadic_preprocessing/04_learn_from_cpp_csvs.py
new file mode 100644
index 00000000..e975ba9b
--- /dev/null
+++ b/polyadic_preprocessing/04_learn_from_cpp_csvs.py
@@ -0,0 +1,126 @@
+import os
+import sys
+from collections import defaultdict
+
+import pandas
+from sklearn.metrics import accuracy_score, precision_score
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+
+from utils import export_text2
+
+from pathlib import Path
+folder = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/nopoly_s0_1/"
+class0 = os.path.join(folder, "output_csv_0.csv")
+class1 = os.path.join(folder, "output_csv_1.csv")
+spec = None  # e.g. "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/log_weekly.json_0.2_0_0_0_clazz=1.txt"
+modelfile = Path(folder).name + ".txt"
+
+def loadDataset(class0, class1):
+    df0 = pandas.read_csv(class0, index_col=0, header=None).transpose()
+    df0['class'] = 0
+    df1 = pandas.read_csv(class1, index_col=0, header=None).transpose()
+    df1['class'] = 1
+    return pandas.concat([df0, df1], axis=0, ignore_index=True).fillna(-1)
+
+def readFileForSpec(filename):
+    S = set()
+    with open(filename, "r") as f:
+        for line in f.readlines():
+            line = line.strip()
+            S.add(line)
+            coex = line.find("CoExistence(")
+            cho = line.find("Choice(")
+            excl = line.find("ExclChoice(")
+            firstOpen = line.find('(')
+            lastClose = line.rfind(')')
+            if coex == 0 or cho == 0 or excl == 0:
+                comma = line.find(',')
+                comma2 = line.rfind(',')
+                if comma == comma2:
+                    act1 = line[firstOpen + 1:comma].strip()
+                    act2 = line[comma + 1:lastClose].strip()
+                    # these templates are symmetric, so the flipped clause is retained as well
+                    if coex == 0:
+                        S.add("CoExistence(" + act2 + "," + act1 + ")")
+                    elif cho == 0:
+                        S.add("Choice(" + act2 + "," + act1 + ")")
+                    elif excl == 0:
+                        S.add("ExclChoice(" + act2 + "," + act1 + ")")
+                else:
+                    sys.exit(1)
+    return S
+
+class LearnRepresentation:
+    def __init__(self, folder, spec=None):
+        self.class0 = os.path.join(folder, "output_csv_0.csv")
+        self.class1 = os.path.join(folder, "output_csv_1.csv")
+        self.spec = spec
+        self.modelfile = Path(folder).name + ".txt"
+        self.dict_list = loadDataset(self.class0, self.class1)
+        if spec is not None:
+            S = readFileForSpec(spec)
+            self.dict_list = self.dict_list[list(set(self.dict_list.columns).intersection(S)) + ["class"]]
+
+    def test(self, poly, supp, red, outcomes):
+        if self.dict_list.empty or (len(set(self.dict_list.columns)) == 1 and ("class" in set(self.dict_list.columns))):
+            print("No data")
+        else:
+            X = self.dict_list.drop(labels=['class'], axis=1)
+            y = self.dict_list['class']
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
+            rf = DecisionTreeClassifier(criterion="gini", max_depth=5)
+            rf.fit(X_train, y_train)
+            y_pred = rf.predict(X_test)
+            accuracy = accuracy_score(y_test, y_pred)
+            precision = precision_score(y_test, y_pred)
+            print(f"poly: {poly}\tsupp: {supp}\tred: {red}\tacc: {accuracy}\tprec: {precision}")
+            outcomes.append({"poly": poly, "supp": supp, "red": red, "accuracy": accuracy, "precision": precision, "model": os.linesep.join(export_text2(rf, X.columns, show_weights=True))})
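The CSV layout that loadDataset() above assumes (an inference from the code, not a documented format) is one row per DECLARE constraint, with the constraint name in the index column and one column per trace; transpose() then yields one row per trace and one feature column per constraint. A hypothetical miniature file:

    import io
    import pandas
    csv0 = 'Exists(a §1),1,0,1\nInit(b §1),0,0,1\n'
    df0 = pandas.read_csv(io.StringIO(csv0), index_col=0, header=None).transpose()
    # df0 now holds 3 rows (traces) with columns "Exists(a §1)" and "Init(b §1)"
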
"nopoly" + red = 1 if (red or (int(red) == 1)) else 0 + return f"{poly}_s{s}_{red}" + +outcomes = [] +path = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/" +# name = genfolder(1, 1.0, 1) +# abs_folder = os.path.join(path, name) +# lr = LearnRepresentation(abs_folder) +# for _ in range(20): + # lr.test(1, 1.0, 1, outcomes) +for name in os.listdir(path): + abs_folder = os.path.join(path, name) + if os.path.isdir(abs_folder): + print(name) + lr = LearnRepresentation(abs_folder) + arr = name.split("_") + arr[0] = 0 if arr[0] == "nopoly" else 1 + arr[1] = float(arr[1][1:]) + for _ in range(20): + lr.test(arr[0], arr[1], int(arr[2])==1, outcomes) +pandas.DataFrame(outcomes).to_csv("results_proposed.csv",index=False) + + + + + + +# if dict_list.empty or (len(set(dict_list.columns)) == 1 and ("class" in set(dict_list.columns))): +# print("No data") +# with open(modelfile, "w") as file: +# file.write("No data") +# else: +# X = dict_list.drop(labels=['class'], axis=1) +# y = dict_list['class'] +# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y) +# rf = DecisionTreeClassifier(criterion="entropy") +# rf.fit(X_train, y_train) +# y_pred = rf.predict(X_test) +# accuracy = accuracy_score(y_test, y_pred) +# print("Accuracy:", accuracy) +# with open(modelfile, "w") as file: +# file.write(os.linesep.join(export_text2(rf, X.columns, show_weights=True))) +# file.write(os.linesep + ("Accuracy: ") + str(accuracy)) \ No newline at end of file diff --git a/polyadic_preprocessing/__pycache__/utils.cpython-310.pyc b/polyadic_preprocessing/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90d517f225fd7c8170ab2ab808a0a46bd5f8ea8d GIT binary patch literal 5923 zcma)A&2JpJl`qy;SGUx%?Af))!6aMj>}1r2(voM9k5MdbW^8$5AgeJfHj6!$+TBR7^1|{VSWwZyB%-eV-~-} zUG6<&`#$%1>6x`(`ge;5y!_1KkJ|$_uny_9@j7^2?);{N zzTBPU|4No6T`<>5+C335#CP&cwY9_J6YaM1p45IP$$Cm?zYvW)<9eZ!G-N(1)I1&V zSP7MMgbW}@k_wV+C>Kbqt=X|={EwErgyL7*Ywf%v)>=v2$UFI3SLTPJp^CMmEI)`3 zK8(`5)e_lSH&0Juo;0Ga6yRwH3_feE^;D7;_3nxGpvdPrbhip0$yMeu|Fyh~Y=b6f z^3cGjY)tYKWRx|qPT7#YHoh>(fL*hOFv`F=V)8wFZ5K_BSbm)-yOwP+YZt8s_7UrT zdti^KHP-*`L!o*y3zh7NFlmOzVN!$%%54$KIBSXUvqHsEt%f4wk7~?lS97&zsM_Nq zMRgTQX$Ks}JfmcpXjQUklw^^SMW!qkkpyn62^vvII9AmEB&$fD+v<+{ujq zQ&GLsj+1QKzDl`S?Ilw?Cuc;M3ELvX%5PHg&y@TNCEr0(^Yx|XcyJMAc%cKC=PKff z)NYCaXtt}JLMUD8Wdt6MwdocjZOWS@rAEmrC3Q+@HO|2;h=8?3MbS0Ti*F$Dna?V0 zkuBQeuS4G@;I9A93^esgy9=&}mjBch{RsuQAuF|pHo2peb?l+T0H}#Gx@Hk7e`DXZ z9vuu_rvIT&Y>P~5`2ZPce;0w1cUOKN*yBQbpykw|dylttax3p)&GJx+6%7fq( zn*RcFUHmAY1-Go?@e{`2Cz$o0cpv`*LAO@jk>sLlOuG+a)$Rv3Z?4^}t7EkX5>VUd zVby&uG?SV|uU_Nn`+#b{zo{+wHv+8=CR1L!q}^7UALvpSveM*0-hyoHB}6pytKGaS zGEBG<9r&1T3V@g3JkuC}cHv5ucKS)zz(6}#CE7jAlT3T6*G)yOBG*Wu-E7K_kTK%r z4~fK@a?|u>l#`=HbR$yo!WgvpWP9 zaQu#;y&~`UihSF>3*L@)5_+k+&`1Cv>qQAEdjasifct!bU32K2+1%|hc|+M}0M%LP z&^dBY0(>byqBiZT@Phxv25`IBR8~=28jNXY3^YrAAGF|%jrag~O9Alm6J7?y-Qr_j z8Ty&c7l!`0zuo6{lkr7feHNgf(p|`Z;bZ|{0>zq9o?Fk^bLYAH+LA=v7K zs8=c>o}F9?@Wm=Th#N;wV#$jTJ37#r93;TrN!W=`!YoI<2+2`HzTo4=*l;ktO31>< z`tI~G#YuN2%n3Kb{D49PQ$;=RVSk;l7>z|vm!Zn3J8?Yejri%gufj&0Q617G z!ww8LkHJlazCF|mA#%Rvp=e1&BDoxoqbXv90K$N^fTrYy$z+9+tI)9-_foaFlV@Vu z&&N|9tiYCPhgmchLgvd9_KwK}jOiiay{Jzm+;776nFtXj37IrT)!~yQO=)<&Lhx|q z#ze988>`{8a0`Wcy5)XuWB19Gn-h!zNLfT2I4W2|HDqA%}c#^<5ji4I~OSl;OF z!?$!d<_&q$N>o8;;|V-%d@|7->nD05&^2l|!h<|dXJ);JPG&&BG@8JX%r_2D64(i0 zOgIrRvC{<3A?ifjXosnYn=?SrdiX06XiBJqY-X^y18-l9?_#>aPwY3#CW!?#k zBMebybsQ{W1T+Y%JH|hydCa3kCL=Y!#%^FZW}RQQccy12)9K%AH!^laUUAAyJ%*d% zwD*4JUrX_?*qrz%I}}cr=sM1t8Rl91#YX 
z%CIX`oMJ(9tmu_SrxxDc4A<*F9Ql(0{IKCNu-gd7)VH?V3g6Q1ZPIVmfABqHb0htY z9&XKQ{#q}8v;D{g^cSBFL;Nw8r^Cvm1P!U=o4U$Qzkb0?VW$UoId=!i_g z6}7M>zRyi)*#G9l)fslU@2wCM-9l8Dfv5jZG&{yOP3vLLIflzmhvLcsH=I%ETeO9O_Grh0~>-gPTO-2R3A*hbjE za3>)tKGI8j54X3YpY7b;zIT6T`|clYZP(W|OAyyqFPLHOd{Mbgn4e~0 zHoNlL#>u@u56Tbgx6`}^ana$tEiID-(U54xydj z%q0Q^`2bDk!bTG@?t%6N$s^lby?Bj4%2IS_%yj@mmaQ}Y5GK2BDRyYXT>Ta<5zcI) z3|-XWdm*%9mA)R_GO{L=iH76Z) zBk2Rgjocar=zoD%j%@h?IUKUCAG*Xjb`(Rlj1j z(E{=i&+*XQU-c{D=2rOA_&}taeLq}TUt6!$uISPj00z18QxZ{DZA5YHJmvCdD9e8% zB5q|QPDh_m-iPzYjf}1~+K7>*$f8stQZj{#KaZd z!mNbT;lw}ar3uW9@x#g34q!#_ySyY7i>P=0X|I?-H?%t9{P+&!A>LURf>GUpuN^xUk2+oR+57VQ?r6(j+! S9G6+e3EZUx`+b`&z5WXpbVVQl literal 0 HcmV?d00001 diff --git a/polyadic_preprocessing/crawl_single_model.py b/polyadic_preprocessing/crawl_single_model.py deleted file mode 100644 index 30951ba3..00000000 --- a/polyadic_preprocessing/crawl_single_model.py +++ /dev/null @@ -1,59 +0,0 @@ -import os.path -import sys - -file = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/removed_non_rilevazione/log_weekly.json_1_1_1_0_clazz=1.txt" - -d_exists = dict() -d_absences = dict() -acts = set() - -with open(file, "r") as f: - for line in f.read().splitlines(): - if line.startswith("Exists"): - firstOpen = line.find('(') - lastClose = line.rfind(')') - lastPar = line.rfind('§') - act = line[firstOpen + 1:lastPar - 1].strip() - acts.add(act) - if act not in d_exists: - d_exists[act] = set() - d_exists[act].add(int(line[lastPar+1:lastClose])) - elif line.startswith("Absence"): - firstOpen = line.find('(') - lastClose = line.rfind(')') - lastPar = line.rfind('§') - act = line[firstOpen + 1:lastPar - 1].strip() - acts.add(act) - if act not in d_absences: - d_absences[act] = set() - d_absences[act].add(int(line[lastPar+1:lastClose])) - elif line.startswith("Init") or line.startswith("End"): - firstOpen = line.find('(') - lastClose = line.rfind(')') - lastPar = line.rfind('§') - act = line[firstOpen + 1:lastPar - 1].strip() - acts.add(act) - else: - firstOpen = line.find('(') - lastClose = line.rfind(')') - comma = line.find(',') - comma2 = line.rfind(',') - if (comma == comma2): - act = line[firstOpen + 1:comma].strip() - acts.add(act) - act = line[comma +1:lastClose].strip() - acts.add(act) - else: - sys.exit(1) - -from pathlib import Path -path = Path(file) -print("Writing acts....") -with open(os.path.join(path.parent.absolute(), "acts.txt"), "w") as fp: - fp.write(os.linesep.join(acts)) - -with open(os.path.join(path.parent.absolute(), "exists.txt"), "w") as fp: - fp.write(os.linesep.join([" ".join([str(len(d_exists[act]))]+[str(x) for x in d_exists[act]]+[act]) for act in d_exists])) - -with open(os.path.join(path.parent.absolute(), "absences.txt"), "w") as fp: - fp.write(os.linesep.join([" ".join([str(len(d_exists[act]))]+[str(x) for x in d_exists[act]]+[act]) for act in d_absences])) \ No newline at end of file diff --git a/polyadic_preprocessing/external/__init__.py b/polyadic_preprocessing/external/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/polyadic_preprocessing/external/rocket_training.py b/polyadic_preprocessing/external/rocket_training.py new file mode 100644 index 00000000..1ea82a4f --- /dev/null +++ b/polyadic_preprocessing/external/rocket_training.py @@ -0,0 +1,9 @@ +import numpy +from sktime.classification.kernel_based import RocketClassifier + +def train(X_train, y_train, X_test, y_test): + clf = RocketClassifier(num_kernels=500, use_multivariate="yes") + clf.fit(X_train, y_train) + acc = numpy.sum(clf.predict(X_test) - 
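A quick way to exercise train() on synthetic panel data (the shapes below are arbitrary; sktime's RocketClassifier accepts 3D numpy arrays of shape (instances, channels, time points)):

    import numpy as np
    from rocket_training import train  # assumes the module is on the import path
    rng = np.random.default_rng(0)
    X_train, y_train = rng.normal(size=(20, 3, 50)), rng.integers(0, 2, 20)
    X_test, y_test = rng.normal(size=(10, 3, 50)), rng.integers(0, 2, 10)
    train(X_train, y_train, X_test, y_test)  # prints the test accuracy
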
diff --git a/polyadic_preprocessing/external/windowshap/README.md b/polyadic_preprocessing/external/windowshap/README.md
new file mode 100644
index 00000000..4ed91a88
--- /dev/null
+++ b/polyadic_preprocessing/external/windowshap/README.md
@@ -0,0 +1,17 @@
+# WindowSHAP
+## Introduction
+
+When working with time-series predictive models, it is crucial to have an explainability method that suits time-series data, is computationally efficient, and can handle dependencies between sequential data points. WindowSHAP is a framework designed specifically for this purpose. This repository includes an implementation of WindowSHAP, an explanation method for time-series classifiers. For more information about the method, please refer to the [original paper](https://arxiv.org/abs/2211.06507).
+
+WindowSHAP improves the explainability of time-series prediction models by reducing the number of features for which Shapley values must be computed: neighboring time steps are merged into a time window. The framework offers various types of time windows, each with its own advantages. The main contributions of WindowSHAP are:
+
+- Adapting Shapley additive explanations to time-series data in an efficient way.
+- Introducing variations of WindowSHAP based on different windowing techniques, for both fixed- and variable-length time windows.
+
+The following figure shows how increasing the window length in the WindowSHAP framework can reduce the runtime of the algorithm exponentially.
+
+![Runtime](RunTime.png)
+
+The framework provides three distinct algorithms: Stationary, Sliding, and Dynamic WindowSHAP. These algorithms explain time-series classifiers using Shapley values.
+
+## How to Use WindowSHAP
+To use the WindowSHAP framework, one just needs to copy the `windowshap.py` file into their working directory. There are three algorithms in the package: Stationary WindowSHAP, Sliding WindowSHAP, and Dynamic WindowSHAP. A class is defined in `windowshap.py` for each of them.
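A minimal usage sketch for the classes that follow (the toy model and the random data are hypothetical; any object exposing a predict() method works):

    import numpy as np
    from windowshap import StationaryWindowSHAP  # assumes windowshap.py is in the working directory

    class MeanModel:  # stand-in for a trained time-series classifier
        def predict(self, x):
            return x.mean(axis=(1, 2)).reshape(-1, 1)

    background = np.random.randn(10, 24, 3)  # (instances, time steps, features)
    test = np.random.randn(1, 24, 3)
    ws = StationaryWindowSHAP(MeanModel(), window_len=6, B_ts=background, test_ts=test)
    ts_phi = ws.shap_values()  # per-time-step attributions for the test instance
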
diff --git a/polyadic_preprocessing/external/windowshap/__init__.py b/polyadic_preprocessing/external/windowshap/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/polyadic_preprocessing/external/windowshap/windowshap.py b/polyadic_preprocessing/external/windowshap/windowshap.py
new file mode 100644
index 00000000..adaf2252
--- /dev/null
+++ b/polyadic_preprocessing/external/windowshap/windowshap.py
@@ -0,0 +1,444 @@
+import math
+import numpy as np
+import shap
+from copy import deepcopy
+import warnings
+warnings.filterwarnings('ignore')
+
+class StationaryWindowSHAP():
+    '''
+    A class for computing the Shapley values of time series data. Only the SHAP values for the first
+    output are reported.
+
+    Parameters:
+    model: A model object used for prediction. It must expose a predict() method that produces the model output for a given input
+    window_len: The length of the time window used by the algorithm
+    B_ts: A 3D numpy array of background time series data
+    test_ts: A 3D numpy array of test time series data
+    B_mask: A 3D numpy array of background masking data. It is only used for specific models, such as GRU-D, where a masking variable is passed to the model alongside the time series data. (default: None)
+    B_dem: A 2D numpy array of background demographic data (non-temporal data). It is only used for models with both temporal and non-temporal modalities. (default: None)
+    test_mask: A 3D numpy array of test mask data (default: None)
+    test_dem: A 2D numpy array of test demographic data (default: None)
+    model_type: The type of model being used. Set the parameter to 'lstm' when time series data is the only input, to 'lstm_dem' when the input includes both time series and demographic (non-temporal) data, and to 'grud' when you are using a GRU-D structure. (default: 'lstm')
+    '''
+    def __init__(self, model, window_len, B_ts, test_ts, B_mask=None, B_dem=None,
+                 test_mask=None, test_dem=None, model_type='lstm'):
+        self.model = model
+        self.window_len = window_len
+        self.num_window = np.ceil(B_ts.shape[1]/self.window_len).astype('int')
+        self.num_background = len(B_ts)
+        self.num_test = len(test_ts)
+        self.background_ts = B_ts
+        self.background_mask = B_mask
+        self.background_dem = B_dem
+        self.test_ts = test_ts
+        self.test_mask = test_mask
+        self.test_dem = test_dem
+        self.model_type = model_type
+        self.ts_phi = None
+        self.dem_phi = None
+        self.explainer = None
+
+        # Problem sizes
+        self.num_ts_ftr = B_ts.shape[2]
+        self.num_ts_step = B_ts.shape[1]
+        self.num_dem_ftr = 0 if B_dem is None else B_dem.shape[1]
+
+        # Creating all data (background and test together)
+        self.all_ts = np.concatenate((self.background_ts, self.test_ts), axis=0)
+        self.all_mask = None if test_mask is None else np.concatenate((self.background_mask, self.test_mask), axis=0)
+        self.all_dem = None if test_dem is None else np.concatenate((self.background_dem, self.test_dem), axis=0)
+
+        # Creating converted data for SHAP
+        self.background_data = self.data_prepare(ts_x=self.background_ts, dem_x=self.background_dem, start_idx=0)
+        self.test_data = self.data_prepare(ts_x=self.test_ts, dem_x=self.test_dem, start_idx=self.num_background)
+
+    def data_prepare(self, ts_x, dem_x=None, start_idx=0):
+        assert len(ts_x.shape) == 3
+        assert dem_x is None or len(dem_x.shape) == 2
+
+        total_num_features = self.num_dem_ftr + self.num_ts_ftr * self.num_window
+
+        # each synthetic SHAP feature stores the row index of the sample it comes from
+        x_ = [[i]*total_num_features for i in range(start_idx, start_idx + ts_x.shape[0])]
+
+        return np.array(x_)
+
+    def wraper_predict(self, x):
+        assert len(x.shape) == 2
+
+        dem_x, ts_x = x[:, :self.num_dem_ftr].copy(), x[:, self.num_dem_ftr:].copy()
+
+        # initializing the value of all arrays
+        ts_x_ = np.zeros((x.shape[0], self.all_ts.shape[1], self.all_ts.shape[2]))
+        mask_x_ = np.zeros_like(ts_x_)
+        dem_x_ = np.zeros_like(dem_x, dtype=float)
+        tstep = np.ones((x.shape[0], self.all_ts.shape[1], 1)) * \
+                np.reshape(np.arange(0, self.all_ts.shape[1]), (1, self.all_ts.shape[1], 1))
+
+        # Reshaping the ts indices based on the number of time windows and features
+        ts_x = ts_x.reshape((ts_x.shape[0], self.num_window, self.num_ts_ftr))
+
+        for i in range(x.shape[0]):
+            # creating time series data
+            for t in range(self.num_ts_step):
+                for j in range(self.num_ts_ftr):
+                    # Finding the corresponding time interval
+                    wind_t = np.ceil((t+1)/self.window_len).astype('int') - 1
+                    ind = ts_x[i, wind_t, j]
+                    ts_x_[i, t, j] = self.all_ts[ind, t, j]
+                    mask_x_[i, t, j] = None if self.all_mask is None else self.all_mask[ind, t, j]
+            # creating static data
+            for j in range(dem_x.shape[1]):
+                ind = dem_x[i,j]
+                dem_x_[i, j] = None if self.all_dem is None else self.all_dem[ind, j]
+
+        # Creating the input of the model based on the different models.
+        # This part should be updated as new models are added to the project
+        if self.model_type == 'lstm_dem':
+            model_input = [ts_x_, dem_x_]
+        elif self.model_type == 'grud':
+            model_input = [ts_x_, mask_x_, tstep]
+        elif self.model_type == 'lstm':
+            model_input = ts_x_
+
+        return self.model.predict(model_input)
+
+    def shap_values(self, num_output=1):
+        self.explainer = shap.KernelExplainer(self.wraper_predict, self.background_data)
+        shap_values = self.explainer.shap_values(self.test_data)
+        shap_values = np.array(shap_values)
+
+        self.dem_phi = shap_values[:, :, :self.num_dem_ftr]
+        ts_shap_values = shap_values[:, :, self.num_dem_ftr:]
+        self.ts_phi = ts_shap_values.reshape((num_output, self.num_test, self.num_window, self.num_ts_ftr))
+
+        # assign values to each single time step by dividing the values by the window length
+        self.ts_phi = np.repeat(self.ts_phi/self.window_len, self.window_len, axis=2)[:,:,:self.num_ts_step,:]
+
+        # Reporting only the first output
+        self.ts_phi = self.ts_phi[0]
+        self.dem_phi = self.dem_phi[0]
+
+        return self.ts_phi if self.num_dem_ftr==0 else (self.dem_phi, self.ts_phi)
+
+
+class SlidingWindowSHAP():
+    '''
+    A class for computing the Shapley values of time series data. Only the SHAP values for the first
+    output are reported.
+
+    Parameters:
+    model: A model object used for prediction. It must expose a predict() method that produces the model output for a given input
+    stride: The stride parameter of the Sliding WindowSHAP algorithm
+    window_len: The length of the time window used by the algorithm
+    B_ts: A 3D numpy array of background time series data
+    test_ts: A 3D numpy array of test time series data
+    B_mask: A 3D numpy array of background masking data. It is only used for specific models, such as GRU-D, where a masking variable is passed to the model alongside the time series data. (default: None)
+    B_dem: A 2D numpy array of background demographic data (non-temporal data). It is only used for models with both temporal and non-temporal modalities. (default: None)
+    test_mask: A 3D numpy array of test mask data (default: None)
+    test_dem: A 2D numpy array of test demographic data (default: None)
+    model_type: The type of model being used. Set the parameter to 'lstm' when time series data is the only input, to 'lstm_dem' when the input includes both time series and demographic (non-temporal) data, and to 'grud' when you are using a GRU-D structure.
(default: 'lstm') + ''' + def __init__(self, model, stride, window_len, B_ts, test_ts, B_mask=None, + B_dem=None, test_mask=None, test_dem=None, model_type='lstm'): + self.model = model + self.model_type = model_type + self.stride = stride + self.window_len = window_len + self.num_window = 2 #Specific to the sliding time window + self.num_background = len(B_ts) + self.num_test = len(test_ts) + self.background_ts = B_ts + self.background_mask = B_mask + self.background_dem = B_dem + self.test_ts = test_ts + self.test_mask = test_mask + self.test_dem = test_dem + self.ts_phi = None + self.dem_phi = None + self.explainer = None + + # Problem sizes + self.num_ts_ftr = B_ts.shape[2] + self.num_ts_step = B_ts.shape[1] + self.num_dem_ftr = 0 if B_dem is None else B_dem.shape[1] + + + # Creating all data (background and test together) + self.all_ts = np.concatenate((self.background_ts, self.test_ts), axis=0) + self.all_mask = None if test_mask is None else np.concatenate((self.background_mask, self.test_mask), axis=0) + self.all_dem = None if test_dem is None else np.concatenate((self.background_dem, self.test_dem), axis=0) + + # Creating converted data for SHAP + self.background_data = self.data_prepare(ts_x=self.background_ts, dem_x=self.background_dem, start_idx=0) + self.test_data = self.data_prepare(ts_x=self.test_ts, dem_x=self.test_dem, start_idx=self.num_background) + + def data_prepare(self, ts_x, dem_x=None, start_idx=0): + # Modified for sliding time window + assert len(ts_x.shape) == 3 + assert dem_x is None or len(dem_x.shape) == 2 + + total_num_features = self.num_dem_ftr + self.num_ts_ftr * self.num_window + + x_ = [[i] * total_num_features for i in range(start_idx, start_idx + ts_x.shape[0])] + + return np.array(x_) + + def wraper_predict(self, x, start_ind=0): + assert len(x.shape) == 2 + + # Calculating the indices inside the time window + inside_ind = list(range(start_ind, start_ind + self.window_len)) + + dem_x, ts_x = x[:, :self.num_dem_ftr].copy(), x[:, self.num_dem_ftr:].copy() + + # initializing the value of all arrays + ts_x_ = np.zeros((x.shape[0], self.num_ts_step, self.num_ts_ftr)) + mask_x_ = np.zeros_like(ts_x_) + dem_x_ = np.zeros_like(dem_x, dtype=float) + tstep = np.ones((x.shape[0], self.num_ts_step, 1)) * \ + np.reshape(np.arange(0, self.num_ts_step), (1, self.num_ts_step, 1)) + + # Reshaping the ts indices based on the num time windows and features + ts_x = ts_x.reshape((ts_x.shape[0], self.num_window, self.num_ts_ftr)) + + for i in range(x.shape[0]): + # creating time series data + for t in range(self.num_ts_step): + for j in range(self.num_ts_ftr): + # Finding the corresponding time interval + wind_t = 0 if (t in inside_ind) else 1 + ind = ts_x[i, wind_t, j] + ts_x_[i, t, j] = self.all_ts[ind, t, j] + mask_x_[i, t, j] = None if self.all_mask is None else self.all_mask[ind, t, j] + # creating static data + for j in range(dem_x.shape[1]): + ind = dem_x[i,j] + dem_x_[i, j] = None if self.all_dem is None else self.all_dem[ind, j] + + # Creating the input of the model based on the different models. 
+        # This part should be updated as new models are added to the project
+        if self.model_type == 'lstm_dem':
+            model_input = [ts_x_, dem_x_]
+        elif self.model_type == 'grud':
+            model_input = [ts_x_, mask_x_, tstep]
+        elif self.model_type == 'lstm':
+            model_input = ts_x_
+
+        return self.model.predict(model_input)
+
+    def shap_values(self, num_output=1, nsamples='auto'):
+        # Initializing the number of time windows and the contribution score matrices
+        seq_len = self.background_ts.shape[1]
+        num_sw = np.ceil((seq_len - self.window_len)/self.stride).astype('int') + 1
+        ts_phi = np.zeros((self.num_test, num_sw, 2, self.background_ts.shape[2]))
+        dem_phi = np.zeros((self.num_test, num_sw, self.num_dem_ftr))
+
+        # Determining the number of samples
+        if nsamples=='auto':
+            nsamples = 10 * self.num_ts_ftr + 5 * self.num_dem_ftr
+
+        # Main loop over the possible windows
+        for stride_cnt in range(num_sw):
+
+            predict = lambda x: self.wraper_predict(x, start_ind=stride_cnt * self.stride)
+
+            # Running SHAP
+            self.explainer = shap.KernelExplainer(predict, self.background_data)
+            shap_values = self.explainer.shap_values(self.test_data, nsamples=nsamples)
+            shap_values = np.array(shap_values)
+
+            # Extracting the SHAP values and storing them
+            dem_shap_values_ = shap_values[:, :, :self.num_dem_ftr]
+            ts_shap_values = shap_values[:, :, self.num_dem_ftr:]
+            ts_shap_values = ts_shap_values.reshape((num_output, self.num_test, 2, self.num_ts_ftr))
+
+            ts_phi[:, stride_cnt, :, :] = ts_shap_values[0]
+            dem_phi[:, stride_cnt, :] = dem_shap_values_[0]
+
+        # Averaging the SHAP values obtained from the different windows
+        ts_phi_agg = np.empty((self.num_test, num_sw, self.num_ts_step, self.num_ts_ftr))
+        ts_phi_agg[:] = np.nan
+        for k in range(num_sw):
+            ts_phi_agg[:,k, k * self.stride:k * self.stride + self.window_len, :] = ts_phi[:, k, 0, :][:, np.newaxis, :]
+        ts_phi_agg = np.nanmean(ts_phi_agg, axis=1)
+        dem_phi = np.nanmean(dem_phi, axis=1)
+
+        self.dem_phi = dem_phi
+        self.ts_phi = ts_phi_agg
+
+        return ts_phi_agg if self.num_dem_ftr==0 else (dem_phi, ts_phi_agg)
+
+
+class DynamicWindowSHAP():
+    '''
+    A class for computing the Shapley values of time series data. Only the SHAP values for the first
+    output are reported.
+
+    Parameters:
+    model: A model object used for prediction. It must expose a predict() method that produces the model output for a given input
+    delta: The threshold value used by the Dynamic WindowSHAP algorithm
+    n_w: The maximum allowed number of time windows for each variable
+    B_ts: A 3D numpy array of background time series data
+    test_ts: A 3D numpy array of test time series data
+    B_mask: A 3D numpy array of background masking data. It is only used for specific models, such as GRU-D, where a masking variable is passed to the model alongside the time series data. (default: None)
+    B_dem: A 2D numpy array of background demographic data (non-temporal data). It is only used for models with both temporal and non-temporal modalities. (default: None)
+    test_mask: A 3D numpy array of test mask data (default: None)
+    test_dem: A 2D numpy array of test demographic data (default: None)
+    model_type: The type of model being used. Set the parameter to 'lstm' when time series data is the only input, to 'lstm_dem' when the input includes both time series and demographic (non-temporal) data, and to 'grud' when you are using a GRU-D structure.
(default: 'lstm') + ''' + def __init__(self, model, delta, n_w, B_ts, test_ts, B_mask=None, B_dem=None, + test_mask=None, test_dem=None, model_type='lstm'): + self.model = model + self.model_type = model_type + self.num_background = len(B_ts) + self.num_test = len(test_ts) + self.background_ts = B_ts + self.background_mask = B_mask + self.background_dem = B_dem + self.test_ts = test_ts + self.test_mask = test_mask + self.test_dem = test_dem + self.ts_phi = None + self.dem_phi = None + self.explainer = None + + # Problem sizes + self.num_ts_ftr = B_ts.shape[2] + self.num_ts_step = B_ts.shape[1] + self.num_dem_ftr = 0 if B_dem is None else B_dem.shape[1] + + ## Specific to Binary Time Window + assert self.num_test == 1 # For binary time window algorithm, samples should be fed to the algorithm one-by-one + self.delta = delta + self.n_w = n_w + self.split_points = [[self.num_ts_step - 1]] * self.num_ts_ftr # Splitting points + self.num_window = [1] * self.num_ts_ftr + + + # Creating all data (background and test together) + self.all_ts = np.concatenate((self.background_ts, self.test_ts), axis=0) + self.all_mask = None if test_mask is None else np.concatenate((self.background_mask, self.test_mask), axis=0) + self.all_dem = None if test_dem is None else np.concatenate((self.background_dem, self.test_dem), axis=0) + + # Creating converted data for SHAP + self.background_data = self.data_prepare(ts_x=self.background_ts, dem_x=self.background_dem, start_idx=0) + self.test_data = self.data_prepare(ts_x=self.test_ts, dem_x=self.test_dem, start_idx=self.num_background) + + def data_prepare(self, ts_x, dem_x=None, start_idx=0): + assert len(ts_x.shape) == 3 + assert dem_x is None or len(dem_x.shape) == 2 + total_num_features = self.num_dem_ftr + sum(self.num_window) ## Specific to Binary Time Window + + x_ = [[i] * total_num_features for i in range(start_idx, start_idx + ts_x.shape[0])] + + return np.array(x_) + + def wraper_predict(self, x): + assert len(x.shape) == 2 + + dem_x, ts_x = x[:, :self.num_dem_ftr].copy(), x[:, self.num_dem_ftr:].copy() + + # initializing the value of all arrays + ts_x_ = np.zeros((x.shape[0], self.num_ts_step, self.num_ts_ftr)) + mask_x_ = np.zeros_like(ts_x_) + dem_x_ = np.zeros_like(dem_x, dtype=float) + tstep = np.ones((x.shape[0], self.num_ts_step, 1)) * \ + np.reshape(np.arange(0, self.num_ts_step), (1, self.num_ts_step, 1)) + + # Reshaping the ts indices based on the time windows for each feature + ## Specific to Binary Time Window + temp_ts_x = np.zeros((ts_x.shape[0], max(self.num_window), self.num_ts_ftr), dtype=int) + for i in range(self.num_ts_ftr): + temp_ts_x[:, :self.num_window[i], i] = ts_x[:, sum(self.num_window[:i]):sum(self.num_window[:i+1])] + ts_x = temp_ts_x + + for i in range(x.shape[0]): + # creating time series data + for j in range(self.num_ts_ftr): + # Finding the corresponding time interval + wind_t = np.searchsorted(self.split_points[j], np.arange(self.num_ts_step)) ## Specific to Binary Time Window + for t in range(self.num_ts_step): + ind = ts_x[i, wind_t[t], j] + ts_x_[i, t, j] = self.all_ts[ind, t, j] + mask_x_[i, t, j] = None if self.all_mask is None else self.all_mask[ind, t, j] + # creating static data + for j in range(dem_x.shape[1]): + ind = dem_x[i,j] + dem_x_[i, j] = None if self.all_dem is None else self.all_dem[ind, j] + + # Creating the input of the model based on the different models. 
+        # This part should be updated as new models are added to the project
+        if self.model_type == 'lstm_dem':
+            model_input = [ts_x_, dem_x_]
+        elif self.model_type == 'grud':
+            model_input = [ts_x_, mask_x_, tstep]
+        elif self.model_type == 'lstm':
+            model_input = ts_x_
+
+        return self.model.predict(model_input)
+
+    def shap_values(self, num_output=1, nsamples_in_loop='auto', nsamples_final='auto'):
+        flag = 1
+        while flag:
+            flag = 0
+            # Updating the number of time windows for each time series feature
+            self.num_window = [len(self.split_points[i]) for i in range(self.num_ts_ftr)]
+
+            # Updating the converted data for SHAP
+            self.background_data = self.data_prepare(ts_x=self.background_ts, dem_x=self.background_dem, start_idx=0)
+            self.test_data = self.data_prepare(ts_x=self.test_ts, dem_x=self.test_dem, start_idx=self.num_background)
+
+            # Running SHAP
+            if nsamples_in_loop == 'auto':
+                nsamples = 2 * sum(self.num_window)
+            else:
+                nsamples = nsamples_in_loop
+
+            self.explainer = shap.KernelExplainer(self.wraper_predict, self.background_data)
+            shap_values = self.explainer.shap_values(self.test_data, nsamples=nsamples)
+            shap_values = np.array(shap_values)
+            dem_phi = shap_values[0, :, :self.num_dem_ftr]  # Extracting dem SHAP values
+            ts_shap_values = shap_values[:, :, self.num_dem_ftr:]  # Extracting ts SHAP values
+
+            # Checking the maximum number of windows condition
+            if max(self.num_window) >= self.n_w:
+                break
+
+            # Splitting every window whose absolute SHAP value exceeds delta
+            for i in range(self.num_ts_ftr):
+                S = set(self.split_points[i])
+                for j in range(self.num_window[i]):
+                    if abs(ts_shap_values[0, 0, sum(self.num_window[:i]) + j]) > self.delta:
+                        S.add(int(self.split_points[i][j] / 2) if j == 0 else int((self.split_points[i][j-1] + self.split_points[i][j]) / 2))
+                if S != set(self.split_points[i]):
+                    flag += 1
+                    self.split_points[i] = list(S)
+                    self.split_points[i].sort()
+
+        # Running SHAP with a large number of samples for the final evaluation of the Shapley values
+        self.explainer = shap.KernelExplainer(self.wraper_predict, self.background_data)
+        shap_values = self.explainer.shap_values(self.test_data, nsamples=nsamples_final)
+        shap_values = np.array(shap_values)
+        dem_phi = shap_values[0, :, :self.num_dem_ftr]  # Extracting dem SHAP values
+        ts_shap_values = shap_values[:, :, self.num_dem_ftr:]  # Extracting ts SHAP values
+
+        # Assigning SHAP values to each single time step
+        ts_phi = np.zeros((self.num_test, self.num_ts_step, self.num_ts_ftr))
+        for i in range(self.num_ts_ftr):
+            for j in range(self.num_window[i]):
+                # Each splitting point belongs to the time window that starts from it;
+                # for the last time window, both its start and end splitting points belong to it
+                start_ind = 0 if j == 0 else self.split_points[i][j-1]
+                end_ind = self.split_points[i][j] + int((j + 1) / self.num_window[i])
+                ts_phi[0, start_ind:end_ind, i] = ts_shap_values[0, :, sum(self.num_window[:i]) + j] / (end_ind - start_ind)
+        self.dem_phi = dem_phi
+        self.ts_phi = ts_phi
+
+        return ts_phi if self.num_dem_ftr == 0 else (dem_phi, ts_phi)
+
+
+if __name__ == "__main__":
+    pass
diff --git a/polyadic_preprocessing/learn_from_cpp_csvs.py b/polyadic_preprocessing/learn_from_cpp_csvs.py
deleted file mode 100644
index 3f28abfd..00000000
--- a/polyadic_preprocessing/learn_from_cpp_csvs.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-import sys
-
-import pandas
-from sklearn.metrics import accuracy_score
-from sklearn.model_selection import train_test_split
-from sklearn.tree import DecisionTreeClassifier
-
-from utils import export_text2
-
-class0 = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/removed_non_rilevazione/output_csv_0.csv"
-class1 = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/removed_non_rilevazione/output_csv_1.csv"
-spec = None#"/home/giacomo/projects/knobab2_loggen/output_model_healthcare/debugged/log_weekly.json_0.2_0_0_0_clazz=1.txt"
-modelfile = "decl_nodata_100model_1-1.txt"
-
-def loadDataset(class0, class1):
-    df0 = pandas.read_csv(class0, index_col=0, header=None).transpose()
-    df0['class'] = 0
-    df1 = pandas.read_csv(class1, index_col=0, header=None).transpose()
-    df1['class'] = 1
-    return pandas.concat([df0, df1], axis=0, ignore_index=True).fillna(-1)
-
-def readFileForSpec(filename):
-    S = set()
-    with open(filename, "r") as f:
-        for line in f.readlines():
-            S.add(line)
-            coex = line.find("CoExistence(")
-            cho = line.find("Choice(")
-            excl = line.find("ExclChoice(")
-            firstOpen = line.find('(')
-            lastClose = line.rfind(')')
-            if (coex==0) or cho==0 or excl==0:
-                comma = line.find(',')
-                comma2 = line.rfind(',')
-                if (comma == comma2):
-                    act1 = line[firstOpen + 1:comma].strip()
-                    act2 = line[comma + 1:lastClose].strip()
-                    if (coex==0):
-                        S.add("CoExistence("+act2+","+act1+")")
-                    elif (cho==0):
-                        S.add("Choice("+act2+","+act1+")")
-                    elif (excl==1):
-                        S.add("ExclChoice("+act2+","+act1+")")
-                    else:
-                        sys.exit(1)
-    return S
-
-dict_list = loadDataset(class0, class1)
-
-if spec is not None:
-    S = readFileForSpec(spec)
-    dict_list = dict_list[list(set(dict_list.columns).intersection(S))+["class"]]
-
-
-
-if dict_list.empty or (len(set(dict_list.columns)) == 1 and ("class" in set(dict_list.columns))):
-    print("No data")
-    with open(modelfile, "w") as file:
-        file.write("No data")
-else:
-    X = dict_list.drop(labels=['class'], axis=1)
-    y = dict_list['class']
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
-    rf = DecisionTreeClassifier(criterion="entropy")
-    rf.fit(X_train, y_train)
-    y_pred = rf.predict(X_test)
-    accuracy = accuracy_score(y_test, y_pred)
-    print("Accuracy:", accuracy)
-    with open(modelfile, "w") as file:
-        file.write(os.linesep.join(export_text2(rf, X.columns, show_weights=True)))
-        file.write(os.linesep + ("Accuracy: ") + str(accuracy))
\ No newline at end of file
diff --git a/polyadic_preprocessing/log_json_to_numpy.py b/polyadic_preprocessing/log_json_to_numpy.py
new file mode 100644
index 00000000..7694e8a4
--- /dev/null
+++ b/polyadic_preprocessing/log_json_to_numpy.py
@@ -0,0 +1,145 @@
+import json
+import os
+import sys
+from collections import defaultdict
+
+import numpy
+import pandas
+from sklearn.metrics import accuracy_score, precision_score
+from sklearn.model_selection import train_test_split
+
+
+def transform_entry(e, S, minsplit):
+    # Drop the columns whose names end in _a/_s/_v/_i
+    torem = list(set(filter(lambda x: x.endswith(("_a", "_s", "_v", "_i")), S)))
+    sc = set(e.columns)
+    for x in S:
+        if x not in sc:
+            e[x] = numpy.nan
+    e.drop(torem, axis=1, inplace=True)
+    e.fillna(0, inplace=True)
+    if minsplit > 0:
+        # Sliding windows; note that .loc slicing is inclusive, so each window spans minsplit + 1 rows
+        for i in range(len(e) - minsplit):
+            tmp = e.loc[i:i + minsplit, :]
+            yield {x: pandas.Series(tmp[x].to_numpy().astype(float)) for x in sorted(set(tmp.columns).difference(torem))}
+    else:
+        # No splitting: one multivariate series per segment
+        yield {x: pandas.Series(e[x].to_numpy().astype(float)) for x in sorted(set(e.columns).difference(torem))}
+
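+# A minimal usage sketch for transform_entry (hypothetical column names; assumes a
+# per-segment DataFrame such as the ones built by log_json_to_numpy below):
+#
+#     seg = pandas.DataFrame({"hr": [60.0, 62.0, 61.0], "class": [1, 1, 1]})
+#     windows = list(transform_entry(seg, S={"hr", "class"}, minsplit=2))
+#     # -> one dict per window, mapping each kept column to a float pandas.Series
+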
+def transform_user_class(v, S, minsplit):
+    for x in v:
+        yield from transform_entry(x, S, minsplit)
+
+
+def transform_users(u, S, minsplit):
+    for k, v in u.items():
+        # Label every generated window with the class key k (window splitting may yield
+        # more windows than there are segments in v)
+        windows = list(transform_user_class(v, S, minsplit))
+        yield (windows, [k] * len(windows))
+
+
+def transform_all(L, S, minsplit):
+    for u in L:
+        yield from transform_users(u, S, minsplit)
+
+
+def classifier(name):
+    # The sktime classifiers are imported lazily so that only the selected backend is loaded
+    if name == 'Rocket':
+        from sktime.classification.kernel_based import RocketClassifier
+        return RocketClassifier(num_kernels=500, use_multivariate="yes")
+    elif name == 'TapNet':
+        from sktime.classification.deep_learning.tapnet import TapNetClassifier
+        return TapNetClassifier(n_epochs=20, batch_size=4)
+    elif name == 'EuclideanKNN':
+        from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
+        return KNeighborsTimeSeriesClassifier(distance="euclidean")
+    elif name == 'CanonicalIntervalForest':
+        from sktime.classification.interval_based import CanonicalIntervalForest
+        return CanonicalIntervalForest(n_estimators=3, n_intervals=2, att_subsample_size=2)
+    elif name == 'ShapeletTransformClassifier':
+        from sktime.classification.shapelet_based import ShapeletTransformClassifier
+        return ShapeletTransformClassifier()
+    elif name == 'SignatureClassifier':
+        from sktime.classification.feature_based import SignatureClassifier
+        return SignatureClassifier()
+
+
+classifiers = ['Rocket', 'TapNet', 'EuclideanKNN', 'CanonicalIntervalForest', 'ShapeletTransformClassifier']
+
+
+def transform(L, S, minsplit):
+    yL = []
+    xL = []
+    for x, y in transform_all(L, S, minsplit):
+        for xs, ys in zip(x, y):
+            xL.append(xs)
+            yL.append(ys)
+    xL = pandas.DataFrame(xL)
+    yL = numpy.array(yL)
+    X_train, X_test, y_train, y_test = train_test_split(xL, yL, test_size=0.3, stratify=yL)
+    d = dict()
+    p = dict()
+    for name in classifiers:
+        clf = classifier(name)
+        try:
+            clf.fit(X_train, y_train)
+            y_pred = clf.predict(X_test)
+            accuracy = accuracy_score(y_test, y_pred)
+            precision = precision_score(y_test, y_pred)
+        except Exception:
+            accuracy = 0
+            precision = 0
+        print(f"{name}: Accuracy={accuracy}, Precision={precision}")
+        d[name] = accuracy
+        p[name] = precision
+    return (d, p)
+
+
+def log_json_to_numpy(filename_json, dominsplit=False):
+    if not os.path.exists(filename_json):
+        return None
+    with open(filename_json, "r") as f:
+        data = json.load(f)
+    data = data["log"]
+    S = set()
+    userL = []
+    minsplit = sys.maxsize
+    for user in data:
+        prevClass = None
+        df = defaultdict(list)
+        buildup = []
+        for event in user["__events"]:
+            payload = event[0]
+            __class = 1 if payload.pop("__class") == "Ok" else 0
+            payload.pop("__label")
+            payload.pop("day")
+            payload.pop("time")
+            payload.pop("fulltime")
+            payload["class"] = __class
+            if prevClass is None:
+                prevClass = __class
+            if prevClass == __class:
+                buildup.append(payload)
+            else:
+                # Close the current same-class segment and open a new one starting at this event
+                minsplit = min(minsplit, len(buildup))
+                df[prevClass].append(pandas.DataFrame(buildup))
+                buildup = [payload]
+                prevClass = __class
+            S = S.union(set(payload.keys()))
+        minsplit = min(minsplit, len(buildup))
+        df[prevClass].append(pandas.DataFrame(buildup))
+        userL.append(df)
+    if dominsplit is False:
+        minsplit = -1
+    return transform(userL, S, minsplit)
+
+
+if __name__ == "__main__":
+    aL = []
+    pL = []
+    for _ in range(20):
+        a, p = log_json_to_numpy("/home/giacomo/projects/sdd-processing/sdd-processing/log_weekly.json",
dominsplit=True)
+        aL.append(a)
+        pL.append(p)
+    pandas.DataFrame(aL).to_csv("accuracy.csv")
+    pandas.DataFrame(pL).to_csv("precision.csv")
diff --git a/polyadic_preprocessing/medical_analysis.py b/polyadic_preprocessing/medical_analysis.py
index b542fa3c..0be40de8 100644
--- a/polyadic_preprocessing/medical_analysis.py
+++ b/polyadic_preprocessing/medical_analysis.py
@@ -23,7 +23,7 @@
 from timeseries.MultiTraceIndexing import MultiTraceIndexing
 from timeseries.SequentialPatternMining import SequentialPatternMining, MiningConfiguration
 from timeseries.TimeSeriesMining import mine_binary_growth_patterns
-from prefixspan import PrefixSpan
+# from prefixspan import PrefixSpan
 
 def extendDictionaryWithTime(d):
     t = d.log.projectProperties("time", lambda l: min(filter(lambda x : isinstance(x, str), l)))
diff --git a/polyadic_preprocessing/crawl_results_model_stats.py b/polyadic_preprocessing/stats_crawl_results_model_stats.py
similarity index 98%
rename from polyadic_preprocessing/crawl_results_model_stats.py
rename to polyadic_preprocessing/stats_crawl_results_model_stats.py
index 2333391c..3b844493 100644
--- a/polyadic_preprocessing/crawl_results_model_stats.py
+++ b/polyadic_preprocessing/stats_crawl_results_model_stats.py
@@ -4,10 +4,10 @@
 import pandas
 import math
 
-supps = set()
-reds = set()
-poly = set()
-rec = set()
+# supps = set()
+# reds = set()
+# poly = set()
+# rec = set()
 filename_fileds = ["mining_supp","reduction","isFilenamePolyadic","reclassify"]
 S = set(["Choice", "RespExistence", "Response", "ChainResponse", "Precedence", "ChainPrecedence", "CoExistence", "Succession", "ChainSuccession", "Init", "End", "Exists", "Absence", "Choice", "ExclChoice"])
 root_dir = "/home/giacomo/projects/knobab2_loggen/output_model_healthcare/"
diff --git a/polyadic_preprocessing/utils.py b/polyadic_preprocessing/utils.py
index 94417558..ffdc934a 100644
--- a/polyadic_preprocessing/utils.py
+++ b/polyadic_preprocessing/utils.py
@@ -19,6 +19,42 @@ def time_in_range(start, delta, x):
     return start <= x or x <= end
 
 
+class ForParsing:
+    """Crawls root_dir for mined-model .txt files and parses their file-name stems
+    into the metadata fields listed in filename_fileds, plus class and filename."""
+    def __init__(self, root_dir, filename_fileds=None, S=None, untimed=None):
+        self.root_dir = root_dir
+        if filename_fileds is None:
+            self.filename_fileds = ["mining_supp", "reduction", "isFilenamePolyadic", "reclassify"]
+        else:
+            self.filename_fileds = list(filename_fileds)
+        if S is None:
+            self.S = {"Choice", "RespExistence", "Response", "ChainResponse", "Precedence", "ChainPrecedence", "CoExistence",
+                      "Succession", "ChainSuccession", "Init", "End", "Exists", "Absence", "ExclChoice"}
+        else:
+            self.S = set(S)
+        if untimed is None:
+            self.untimed = {"Choice", "RespExistence", "CoExistence", "ExclChoice"}
+        else:
+            self.untimed = set(untimed)
+
+    def yielder(self, ff):
+        import glob
+        from pathlib import Path
+        # The file itself is not read here; ff receives the path and decides what to load
+        for filename in glob.iglob(self.root_dir + '**/*.txt', recursive=True):
+            stem = Path(filename).stem.split("_")
+            clazz = stem[-1].replace("clazz=", "")
+            stem = stem[:-1]
+            d = dict(zip(self.filename_fileds, stem[-len(self.filename_fileds):]))
+            d["class"] = clazz
+            d["filename"] = "_".join(stem[:-len(self.filename_fileds)])
+            key = tuple(stem[-len(self.filename_fileds):])
+            ff(filename, key, d)
+
+
 def export_text2(decision_tree, feature_names=None, spacing=3, decimals=5, show_weights=False):
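+    # Variant of sklearn.tree.export_text; judging from the call sites in this repository
+    # (e.g. export_text2(rf, X.columns, show_weights=True) in the removed learn_from_cpp_csvs.py),
+    # it returns an iterable of text lines describing the fitted tree, which callers join
+    # with os.linesep before writing to disk.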