From eb07195ea8bcd79c387c909ede43919957c2344e Mon Sep 17 00:00:00 2001
From: Peyton Lee <peyton.lee@alleninstitute.org>
Date: Thu, 14 Nov 2024 12:25:21 -0800
Subject: [PATCH 1/7] feat: Added CSV parser as a mock image dataset

---
 package-lock.json                            |  16 +
 package.json                                 |   2 +
 src/state/image-dataset/csv-dataset/index.ts | 397 +++++++++++++++++++
 3 files changed, 415 insertions(+)
 create mode 100644 src/state/image-dataset/csv-dataset/index.ts

diff --git a/package-lock.json b/package-lock.json
index 91ada186..de7bee52 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,6 +19,7 @@
                 "firebase": "^7.24.0",
                 "history": "^4.7.2",
                 "lodash": "^4.17.20",
+                "papaparse": "^5.4.1",
                 "plotly.js": "^2.35.2",
                 "pram": "^2.0.0-alpha.0",
                 "react": "^16.14.0",
@@ -44,6 +45,7 @@
                 "@types/lodash": "^4.14.176",
                 "@types/mini-css-extract-plugin": "^2.2.0",
                 "@types/mocha": "^2.2.44",
+                "@types/papaparse": "^5.3.15",
                 "@types/plotly.js": "^1.54.10",
                 "@types/postcss-flexbugs-fixes": "^5.0.3",
                 "@types/react": "^16.14.0",
@@ -4302,6 +4304,15 @@
                 "@types/node": "*"
             }
         },
+        "node_modules/@types/papaparse": {
+            "version": "5.3.15",
+            "resolved": "https://registry.npmjs.org/@types/papaparse/-/papaparse-5.3.15.tgz",
+            "integrity": "sha512-JHe6vF6x/8Z85nCX4yFdDslN11d+1pr12E526X8WAfhadOeaOTx5AuIkvDKIBopfvlzpzkdMx4YyvSKCM9oqtw==",
+            "dev": true,
+            "dependencies": {
+                "@types/node": "*"
+            }
+        },
         "node_modules/@types/parse-json": {
             "version": "4.0.2",
             "dev": true,
@@ -11804,6 +11815,11 @@
             "resolved": "https://registry.npmjs.org/pako/-/pako-2.1.0.tgz",
             "integrity": "sha512-w+eufiZ1WuJYgPXbV/PO3NCMEc3xqylkKHzp8bxp1uW4qaSNQUkwmLLEc3kKsfz8lpV1F8Ht3U1Cm+9Srog2ug=="
         },
+        "node_modules/papaparse": {
+            "version": "5.4.1",
+            "resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.4.1.tgz",
+            "integrity": "sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw=="
+        },
         "node_modules/param-case": {
             "version": "3.0.4",
             "dev": true,
diff --git a/package.json b/package.json
index d88a1cbe..e695b3a9 100755
--- a/package.json
+++ b/package.json
@@ -38,6 +38,7 @@
         "@types/lodash": "^4.14.176",
         "@types/mini-css-extract-plugin": "^2.2.0",
         "@types/mocha": "^2.2.44",
+        "@types/papaparse": "^5.3.15",
         "@types/plotly.js": "^1.54.10",
         "@types/postcss-flexbugs-fixes": "^5.0.3",
         "@types/react": "^16.14.0",
@@ -92,6 +93,7 @@
         "firebase": "^7.24.0",
         "history": "^4.7.2",
         "lodash": "^4.17.20",
+        "papaparse": "^5.4.1",
         "plotly.js": "^2.35.2",
         "pram": "^2.0.0-alpha.0",
         "react": "^16.14.0",
diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts
new file mode 100644
index 00000000..7f6647d7
--- /dev/null
+++ b/src/state/image-dataset/csv-dataset/index.ts
@@ -0,0 +1,397 @@
+import { ViewerChannelSettings } from "@aics/web-3d-viewer";
+import { Album } from "../..";
+import * as Papa from "papaparse";
+import {
+    DataForPlot,
+    MeasuredFeatureDef,
+    FileInfo,
+    PerCellLabels,
+    MeasuredFeaturesOption,
+    DiscreteMeasuredFeatureDef,
+    ContinuousMeasuredFeatureDef,
+} from "../../metadata/types";
+import { ImageDataset, InitialDatasetSelections, Megaset } from "../types";
+import firebase from "firebase";
+import {
+    CELL_ID_KEY,
+    FOV_ID_KEY,
+    FOV_THUMBNAIL_PATH,
+    FOV_VOLUME_VIEWER_PATH,
+    GROUP_BY_KEY,
+    THUMBNAIL_PATH,
+    TRANSFORM,
+    VOLUME_VIEWER_PATH,
+} from "../../../constants";
+
+export const DEFAULT_CSV_DATASET_KEY = "csv";
+const reservedKeys = new Set([
+    CELL_ID_KEY,
+    FOV_ID_KEY,
+    FOV_THUMBNAIL_PATH,
+    FOV_VOLUME_VIEWER_PATH,
+    THUMBNAIL_PATH,
+    VOLUME_VIEWER_PATH,
+    TRANSFORM,
+]);
+
+// From Adobe categorical colors
+const DEFAULT_COLORS = [
+    "#27B4AE",
+    "#4047C4",
+    "#F48730",
+    "#DB4281",
+    "#7E84F4",
+    "#78DF76",
+    "#1C7AED",
+    "#7129CD",
+    "#E7C73B",
+    "#C95F1E",
+    "#188E61",
+    "#BEE952",
+];
+
+function isNumeric(value: string): boolean {
+    if (typeof value != "string") {
+        return false;
+    }
+    return !isNaN(Number(value)) && !isNaN(parseFloat(value));
+}
+
+function isStringArray(data: string[] | (number | null)[]): data is string[] {
+    return data.length > 0 && typeof data[0] === "string";
+}
+
+const enum FeatureType {
+    CONTINUOUS,
+    DISCRETE,
+}
+
+type FeatureInfo =
+    | {
+          type: FeatureType.CONTINUOUS;
+          def: ContinuousMeasuredFeatureDef;
+          data: (number | null)[];
+      }
+    | {
+          type: FeatureType.DISCRETE;
+          def: DiscreteMeasuredFeatureDef;
+          data: (number | null)[];
+      };
+
+/**
+ * Parses and mocks an ImageDataset from a provided CSV string with a header row.
+ *
+ * The CSV must contain:
+ * - URL paths to volume data, under the column name "volumeviewerPath"
+ *
+ * Optionally, the CSV can contain:
+ * - Thumbnail paths, under the column name "thumbnailPath"
+ * - Cell IDs, under the column name "CellId"
+ * - FOV IDs, under the column name "FOVId"
+ * - FOV thumbnail paths, under the column name "fovThumbnailPath"
+ * - FOV volume data, under the column name "fovVolumeviewerPath"
+ *
+ * Some data will be ignored by default:
+ * - Transform data, under the column name "transform"
+ *
+ * Any other columns will be interpreted as features:
+ * - Columns containing only numbers will be treated as numeric data.
+ * - Columns containing any non-numeric data will be treated as category ("discrete") data.
+ */
+class CsvRequest implements ImageDataset {
+    csvData: Record<string, string>[];
+    idToIndex: Record<string, number>;
+    featureInfo: Map<string, FeatureInfo>;
+
+    defaultGroupByFeatureKey: string;
+
+    constructor(csvFileContents: string) {
+        this.csvData = [];
+        this.idToIndex = {};
+        this.featureInfo = new Map();
+        // TODO: Automatically detect a discrete feature and replace the group
+        // by feature with it.
+        this.defaultGroupByFeatureKey = "CellId";
+        this.parseCsvData(csvFileContents);
+    }
+
+    /**
+     * Returns all of the column names that are not reserved for metadata, with the
+     * assumption that they are features.
+     */
+    private getNonReservedFeatureColumns(csvData: Record<string, string>[]): string[] {
+        const keys = Object.keys(csvData[0]);
+        return keys.filter((key) => !reservedKeys.has(key));
+    }
+
+    /**
+     * Used for default initialization. Returns the feature key at the given index,
+     * clamped to the length of the features array.
+     */
+    private getFeatureKeyClamped(featureKeys: string[], index: number): string {
+        const lastIndex = featureKeys.length - 1;
+        return featureKeys[Math.min(Math.max(index, 0), lastIndex)];
+    }
+
+    /**
+     * Returns the feature data as a map from the feature name to an array of either
+     * numeric or string values.
+     */
+    private getFeatureDataAsColumns(
+        csvData: Record<string, string>[],
+        featureKeys: string[]
+    ): Map<string, string[] | number[]> {
+        const featureData = new Map<string, string[] | number[]>();
+        for (const key of featureKeys) {
+            const rawValues: string[] = [];
+            let isContinuous = true;
+            for (const row of csvData) {
+                rawValues.push(row[key]);
+                if (!isNumeric(row[key])) {
+                    isContinuous = false;
+                }
+            }
+
+            if (isContinuous) {
+                // Feature is continuous, parse all values as numeric
+                const values = rawValues.map((val) => Number.parseFloat(val));
+                featureData.set(key, values);
+            } else {
+                // Feature is discrete, return directly
+                featureData.set(key, rawValues);
+            }
+        }
+        return featureData;
+    }
+
+    private parseDiscreteFeature(
+        key: string,
+        data: string[]
+    ): { def: DiscreteMeasuredFeatureDef; data: (number | null)[] } {
+        const strValueToIndex = new Map<string, { index: number; count: number }>();
+        const remappedValues: (number | null)[] = [];
+
+        // Iterate through all values and count them. Replace the values with their
+        // corresponding index.
+        for (let i = 0; i < data.length; i++) {
+            const value = data[i];
+            let indexInfo = strValueToIndex.get(value);
+            if (!indexInfo) {
+                // Assign new index to this value
+                indexInfo = { index: strValueToIndex.size, count: 0 };
+                strValueToIndex.set(value, indexInfo);
+            }
+
+            indexInfo.count++;
+            remappedValues.push(indexInfo.index);
+        }
+
+        const options: Record<string, MeasuredFeaturesOption> = {};
+        for (const [value, { index, count }] of strValueToIndex.entries()) {
+            options[index.toString()] = {
+                color: DEFAULT_COLORS[index % DEFAULT_COLORS.length],
+                name: value,
+                key: value,
+                count: count,
+            };
+        }
+
+        return {
+            def: {
+                discrete: true,
+                displayName: key,
+                description: key,
+                key,
+                options,
+                tooltip: key,
+            },
+            data: remappedValues,
+        };
+    }
+
+    private parseFeatures(csvData: Record<string, string>[]): void {
+        this.featureInfo.clear();
+
+        const featureKeys = this.getNonReservedFeatureColumns(csvData);
+        const rawFeatureData = this.getFeatureDataAsColumns(csvData, featureKeys);
+
+        for (const key of featureKeys) {
+            const data = rawFeatureData.get(key);
+            if (!data) {
+                continue;
+            }
+            if (isStringArray(data)) {
+                const { def, data: discreteData } = this.parseDiscreteFeature(key, data);
+                this.featureInfo.set(key, {
+                    type: FeatureType.DISCRETE,
+                    def,
+                    data: discreteData,
+                });
+            } else {
+                const def: ContinuousMeasuredFeatureDef = {
+                    discrete: false,
+                    displayName: key,
+                    description: key,
+                    key,
+                    tooltip: key,
+                };
+                this.featureInfo.set(key, {
+                    type: FeatureType.CONTINUOUS,
+                    def,
+                    data: data,
+                });
+            }
+        }
+
+        // TODO: Feature defs can include units. Should we strip that from the feature column name?
+    }
+
+    private parseCsvData(csvDataSrc: string): void {
+        // TODO: handle URLs and files here: they need to be handled via async callbacks.
+        // https://www.papaparse.com/docs#strings
+        const result = Papa.parse(csvDataSrc, { header: true }).data as Record<string, string>[];
+        this.csvData = result as Record<string, string>[];
+
+        if (this.csvData.length === 0) {
+            throw new Error("No data found in CSV");
+        }
+
+        // Map from cell IDs to row index. If no cell ID is provided, assign the row number.
+        for (let i = 0; i < this.csvData.length; i++) {
+            const row = this.csvData[i];
+            if (row[CELL_ID_KEY] !== undefined) {
+                this.idToIndex[row[CELL_ID_KEY]] = i;
+            } else {
+                // Substitute with index if no cell ID is provided
+                row[CELL_ID_KEY] = i.toString();
+                this.idToIndex[i.toString()] = i;
+            }
+        }
+
+        this.parseFeatures(this.csvData);
+    }
+
+    selectDataset(): Promise<InitialDatasetSelections> {
+        const featureKeys = Array.from(this.featureInfo.keys());
+        return Promise.resolve({
+            defaultXAxis: this.getFeatureKeyClamped(featureKeys, 0),
+            defaultYAxis: this.getFeatureKeyClamped(featureKeys, 1),
+            defaultColorBy: this.defaultGroupByFeatureKey,
+            defaultGroupBy: this.defaultGroupByFeatureKey,
+            // TODO: Provide the containing folder of the CSV if the values for the columns (thumbnails,
+            // downloads, volumes) are relative paths and not HTTPS URLs.
+            thumbnailRoot: "",
+            downloadRoot: "",
+            volumeViewerDataRoot: "",
+        });
+    }
+
+    getAvailableDatasets(): Promise<Megaset[]> {
+        // Only has one dataset (imported CSV)
+        const fakeSet: Megaset = {
+            name: "csv",
+            title: "CSV Dataset",
+            production: false,
+            dateCreated: firebase.firestore.Timestamp.now(),
+            datasets: {
+                csv: {
+                    name: "csv",
+                    title: "CSV Dataset",
+                    version: "1",
+                    id: DEFAULT_CSV_DATASET_KEY,
+                    description: "A dataset imported from a CSV file",
+                    index: 0,
+                    userData: {},
+                },
+            },
+        };
+
+        return Promise.resolve([fakeSet]);
+    }
+
+    getViewerChannelSettings(): Promise<ViewerChannelSettings> {
+        // By default, enable first three channels
+        // TODO: Have this constant be exposed by w3cv?
+        return Promise.resolve({
+            groups: [
+                {
+                    name: "Channels",
+                    channels: [
+                        { match: [0, 1, 2], enabled: true },
+                        { match: "(.+)", enabled: false },
+                    ],
+                },
+            ],
+        });
+    }
+
+    private getFeatureKeyToData(): Record<string, (number | null)[]> {
+        const featureKeyToData: Record<string, (number | null)[]> = {};
+        for (const [key, info] of this.featureInfo.entries()) {
+            featureKeyToData[key] = info.data;
+        }
+        return featureKeyToData;
+    }
+
+    getFeatureData(): Promise<DataForPlot | void> {
+        const indices = this.csvData.map((_row, index) => index);
+        const values: Record<string, (number | null)[]> = this.getFeatureKeyToData();
+        const labels: PerCellLabels = {
+            thumbnailPaths: [],
+            cellIds: [],
+        };
+
+        for (let i = 0; i < indices.length; i++) {
+            // TODO: Calculate in advance
+            const row = this.csvData[i];
+            // Copy label data
+            labels.cellIds.push(row[CELL_ID_KEY]);
+            labels.thumbnailPaths.push(row[THUMBNAIL_PATH] || "");
+        }
+
+        return Promise.resolve({
+            indices,
+            values,
+            labels,
+        });
+    }
+
+    getAlbumData(): Promise<Album[]> {
+        return Promise.resolve([]);
+    }
+
+    getMeasuredFeatureDefs(): Promise<MeasuredFeatureDef[]> {
+        const featureDefsArray = Array.from(this.featureInfo.values()).map((info) => info.def);
+        return Promise.resolve(featureDefsArray);
+    }
+
+    getFileInfoByCellId(id: string): Promise<FileInfo | undefined> {
+        const rowIndex = this.idToIndex[id];
+        if (rowIndex === undefined) {
+            return Promise.resolve(undefined);
+        }
+        const data = this.csvData[rowIndex];
+
+        if (!data) {
+            return Promise.resolve(undefined);
+        }
+        const fileInfo = {
+            [CELL_ID_KEY]: data[CELL_ID_KEY] || "",
+            [FOV_ID_KEY]: data[FOV_ID_KEY] || "",
+            [FOV_THUMBNAIL_PATH]: data[FOV_THUMBNAIL_PATH] || "",
+            [FOV_VOLUME_VIEWER_PATH]: data[FOV_VOLUME_VIEWER_PATH] || "",
+            [THUMBNAIL_PATH]: data[THUMBNAIL_PATH] || "",
+            [VOLUME_VIEWER_PATH]: data[VOLUME_VIEWER_PATH] || "",
+            [GROUP_BY_KEY]: data[GROUP_BY_KEY] || this.defaultGroupByFeatureKey,
+        };
+        return Promise.resolve(fileInfo);
+    }
+
+    getFileInfoByArrayOfCellIds(ids: string[]): Promise<(FileInfo | undefined)[]> {
+        // Look up every ID, keeping input order; unknown IDs resolve to `undefined`.
+        // Promise.all already returns a promise, so no Promise.resolve wrapper is needed.
+        return Promise.all(ids.map((id) => this.getFileInfoByCellId(id)));
+    }
+}
+
+export default CsvRequest;

From a2549991b6e519800f9cc46dd5f65b19e24d543e Mon Sep 17 00:00:00 2001
From: Peyton Lee <peyton.lee@alleninstitute.org>
Date: Thu, 14 Nov 2024 12:45:47 -0800
Subject: [PATCH 2/7] feat: Added unit tests, handled NaN values

---
 src/state/image-dataset/csv-dataset/index.ts  |  4 ++
 .../csv-dataset/test/CsvRequest.test.ts       | 39 +++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts

diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts
index 7f6647d7..2216d1a0 100644
--- a/src/state/image-dataset/csv-dataset/index.ts
+++ b/src/state/image-dataset/csv-dataset/index.ts
@@ -54,6 +54,9 @@ function isNumeric(value: string): boolean {
     if (typeof value != "string") {
         return false;
     }
+    if (value.trim().toLowerCase() === "nan") {
+        return true;
+    }
     return !isNaN(Number(value)) && !isNaN(parseFloat(value));
 }
 
@@ -154,6 +157,7 @@ class CsvRequest implements ImageDataset {
 
             if (isContinuous) {
                 // Feature is continuous, parse all values as numeric
+                // TODO: Handle empty/blank values
                 const values = rawValues.map((val) => Number.parseFloat(val));
                 featureData.set(key, values);
             } else {
diff --git a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
new file mode 100644
index 00000000..ed823690
--- /dev/null
+++ b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
@@ -0,0 +1,39 @@
+import { expect } from "chai";
+import CsvRequest from "..";
+
+const testCsv =
+    "CellId,volumeviewerPath,thumbnailPath,feature1,feature2,feature3,discretefeature" +
+    "\npotato,https://example.com/1/raw.ome.zarr,https://example.com/1.png,1,2,3,A" +
+    "\ngarbanzo,https://example.com/2/raw.ome.zarr,https://example.com/2.jpeg,7,3.4,1,B" +
+    "\nturnip,https://example.com/3/raw.ome.zarr,https://example.com/3.jpeg,4,5,6,B" +
+    "\nrutabaga,https://example.com/4/raw.ome.zarr,https://example.com/4.jpeg,9,2.8,NaN,C";
+
+describe("CsvRequest", () => {
+    it("can be initialized with test data", () => {
+        new CsvRequest(testCsv);
+    });
+
+    it("extracts feature data", async () => {
+        const csvDataset = new CsvRequest(testCsv);
+
+        const featureData = await csvDataset.getFeatureData();
+        expect(featureData).to.deep.equal({
+            indices: [0, 1, 2, 3],
+            values: {
+                feature1: [1, 7, 4, 9],
+                feature2: [2, 3.4, 5, 2.8],
+                feature3: [3, 1, 6, NaN],
+                discretefeature: [0, 1, 1, 2],
+            },
+            labels: {
+                thumbnailPaths: [
+                    "https://example.com/1.png",
+                    "https://example.com/2.jpeg",
+                    "https://example.com/3.jpeg",
+                    "https://example.com/4.jpeg",
+                ],
+                cellIds: ["potato", "garbanzo", "turnip", "rutabaga"],
+            },
+        });
+    });
+});

From 5196cd3f1afbd64f6e6e8aa35d5f48b27c9e0a58 Mon Sep 17 00:00:00 2001
From: Peyton Lee <peyton.lee@alleninstitute.org>
Date: Thu, 14 Nov 2024 12:49:46 -0800
Subject: [PATCH 3/7] doc: Added additional TODOs for CSV unit tests

---
 .../image-dataset/csv-dataset/test/CsvRequest.test.ts    | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
index ed823690..f2143cac 100644
--- a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
+++ b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
@@ -36,4 +36,13 @@ describe("CsvRequest", () => {
             },
         });
     });
+    /**
+     * TODO:
+     * - Check for spaces in CSV input
+     * - Check for empty values in CSV input
+     * - Check for null/NaN values in CSV input
+     * - Check for behavior when there is no discrete feature column -> validate groupby
+     * - Check for handling of BFF-specific column names (they should be remapped)
+     * - Check that metadata-related columns are not parsed as features
+     */
 });

From 608f0a01e4138f57607122f4d926ea89f864ec4f Mon Sep 17 00:00:00 2001
From: Peyton Lee <peyton.lee@alleninstitute.org>
Date: Thu, 14 Nov 2024 13:03:39 -0800
Subject: [PATCH 4/7] refactor: Renamed constants

---
 src/state/image-dataset/csv-dataset/index.ts                | 6 +++---
 src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts
index 2216d1a0..0644a3ce 100644
--- a/src/state/image-dataset/csv-dataset/index.ts
+++ b/src/state/image-dataset/csv-dataset/index.ts
@@ -24,7 +24,8 @@ import {
 } from "../../../constants";
 
 export const DEFAULT_CSV_DATASET_KEY = "csv";
-const reservedKeys = new Set([
+
+const METADATA_KEYS = new Set([
     CELL_ID_KEY,
     FOV_ID_KEY,
     FOV_THUMBNAIL_PATH,
@@ -124,7 +125,7 @@ class CsvRequest implements ImageDataset {
      */
     private getNonReservedFeatureColumns(csvData: Record<string, string>[]): string[] {
         const keys = Object.keys(csvData[0]);
-        return keys.filter((key) => !reservedKeys.has(key));
+        return keys.filter((key) => !METADATA_KEYS.has(key));
     }
 
     /**
@@ -346,7 +347,6 @@ class CsvRequest implements ImageDataset {
         };
 
         for (let i = 0; i < indices.length; i++) {
-            // TODO: Calculate in advance
             const row = this.csvData[i];
             // Copy label data
             labels.cellIds.push(row[CELL_ID_KEY]);
diff --git a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
index f2143cac..60f484c8 100644
--- a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
+++ b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
@@ -43,6 +43,7 @@ describe("CsvRequest", () => {
      * - Check for null/NaN values in CSV input
      * - Check for behavior when there is no discrete feature column -> validate groupby
      * - Check for handling of BFF-specific column names (they should be remapped)
+     * - Check that metadata columns are parsed correctly
      * - Check that metadata-related columns are not parsed as features
      */
 });

From df8084df1e93cc1ba7f0bcdc4d29c345c8d07667 Mon Sep 17 00:00:00 2001
From: Peyton Lee <peyton.lee@alleninstitute.org>
Date: Thu, 14 Nov 2024 13:58:25 -0800
Subject: [PATCH 5/7] refactor: Renamed methods

---
 src/state/image-dataset/csv-dataset/index.ts | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts
index 0644a3ce..ceda570e 100644
--- a/src/state/image-dataset/csv-dataset/index.ts
+++ b/src/state/image-dataset/csv-dataset/index.ts
@@ -123,7 +123,7 @@ class CsvRequest implements ImageDataset {
      * Returns all of the column names that are not reserved for metadata, with the
      * assumption that they are features.
      */
-    private getNonReservedFeatureColumns(csvData: Record<string, string>[]): string[] {
+    private getFeatureKeysFromColumnNames(csvData: Record<string, string>[]): string[] {
         const keys = Object.keys(csvData[0]);
         return keys.filter((key) => !METADATA_KEYS.has(key));
     }
@@ -217,7 +217,7 @@ class CsvRequest implements ImageDataset {
     private parseFeatures(csvData: Record<string, string>[]): void {
         this.featureInfo.clear();
 
-        const featureKeys = this.getNonReservedFeatureColumns(csvData);
+        const featureKeys = this.getFeatureKeysFromColumnNames(csvData);
         const rawFeatureData = this.getFeatureDataAsColumns(csvData, featureKeys);
 
         for (const key of featureKeys) {
@@ -264,13 +264,10 @@ class CsvRequest implements ImageDataset {
         // Map from cell IDs to row index. If no cell ID is provided, assign the row number.
         for (let i = 0; i < this.csvData.length; i++) {
             const row = this.csvData[i];
-            if (row[CELL_ID_KEY] !== undefined) {
-                this.idToIndex[row[CELL_ID_KEY]] = i;
-            } else {
-                // Substitute with index if no cell ID is provided
+            if (row[CELL_ID_KEY] === undefined) {
                 row[CELL_ID_KEY] = i.toString();
-                this.idToIndex[i.toString()] = i;
             }
+            this.idToIndex[row[CELL_ID_KEY]] = i;
         }
 
         this.parseFeatures(this.csvData);

From ac8ad66d76f9b4c8104ea45c91f37249c3327c39 Mon Sep 17 00:00:00 2001
From: Peyton Lee <peyton.lee@alleninstitute.org>
Date: Tue, 19 Nov 2024 17:49:29 -0800
Subject: [PATCH 6/7] refactor: Update comments, use constants, remove
 redundant type casting

---
 src/state/image-dataset/csv-dataset/index.ts | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts
index ceda570e..f02d8785 100644
--- a/src/state/image-dataset/csv-dataset/index.ts
+++ b/src/state/image-dataset/csv-dataset/index.ts
@@ -35,7 +35,8 @@ const METADATA_KEYS = new Set([
     TRANSFORM,
 ]);
 
-// From Adobe categorical colors
+// Adobe palette of high-contrast colors for denoting different categories
+// Used for categorical data
 const DEFAULT_COLORS = [
     "#27B4AE",
     "#4047C4",
@@ -99,7 +100,7 @@ type FeatureInfo =
  * - Transform data, under the column name "transform"
  *
  * Any other columns will be interpreted as features:
- * - Columns containing only numbers will be treated as numeric data.
+ * - Columns containing only numbers will be treated as numeric ("continuous") data.
  * - Columns containing any non-numeric data will be treated as category ("discrete") data.
  */
 class CsvRequest implements ImageDataset {
@@ -115,7 +116,7 @@ class CsvRequest implements ImageDataset {
         this.featureInfo = new Map();
         // TODO: Automatically detect a discrete feature and replace the group
         // by feature with it.
-        this.defaultGroupByFeatureKey = "CellId";
+        this.defaultGroupByFeatureKey = CELL_ID_KEY;
         this.parseCsvData(csvFileContents);
     }
 
@@ -254,7 +255,7 @@ class CsvRequest implements ImageDataset {
     private parseCsvData(csvDataSrc: string): void {
         // TODO: handle URLs and files here: they need to be handled via async callbacks.
         // https://www.papaparse.com/docs#strings
-        const result = Papa.parse(csvDataSrc, { header: true }).data as Record<string, string>[];
+        const result = Papa.parse(csvDataSrc, { header: true, skipEmptyLines: true }).data;
         this.csvData = result as Record<string, string>[];
 
         if (this.csvData.length === 0) {

From 7dbd6e9994b137df31eecb99c68f53140eb6db07 Mon Sep 17 00:00:00 2001
From: Peyton Lee <peyton.lee@alleninstitute.org>
Date: Tue, 19 Nov 2024 18:06:41 -0800
Subject: [PATCH 7/7] fix: Handled edge case where row numbers and cell IDs
 could collide

---
 src/state/image-dataset/csv-dataset/index.ts  | 13 ++++++++++++-
 .../csv-dataset/test/CsvRequest.test.ts       | 19 +++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts
index f02d8785..dfee746b 100644
--- a/src/state/image-dataset/csv-dataset/index.ts
+++ b/src/state/image-dataset/csv-dataset/index.ts
@@ -262,10 +262,21 @@ class CsvRequest implements ImageDataset {
             throw new Error("No data found in CSV");
         }
 
-        // Map from cell IDs to row index. If no cell ID is provided, assign the row number.
+        let useOriginalKey = true;
+        // Check if all rows have a non-blank cell ID. If not, we must use the row
+        // index for every row to prevent duplicate values from being added to the map.
         for (let i = 0; i < this.csvData.length; i++) {
             const row = this.csvData[i];
-            if (row[CELL_ID_KEY] === undefined) {
+            if (row[CELL_ID_KEY] === undefined || row[CELL_ID_KEY].trim() === "") {
+                useOriginalKey = false;
+                break;
+            }
+        }
+
+        // Map cell IDs to row indices. If any row lacks a cell ID, all rows use their row number.
+        for (let i = 0; i < this.csvData.length; i++) {
+            const row = this.csvData[i];
+            if (!useOriginalKey) {
                 row[CELL_ID_KEY] = i.toString();
             }
             this.idToIndex[row[CELL_ID_KEY]] = i;
diff --git a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
index 60f484c8..4c1d8738 100644
--- a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
+++ b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts
@@ -35,7 +35,26 @@ describe("CsvRequest", () => {
                 cellIds: ["potato", "garbanzo", "turnip", "rutabaga"],
             },
         });
+    });
+
+    it("handles cell id vs. row index collisions when data is incomplete.", async () => {
+        const csvString =
+            "feature1,CellId" +
+            "\nA,0" +
+            "\nB,2" +
+            "\nC" +
+            "\nD,5";
+        // The "C" row is cut short, so it has no CellId at all. If row indices were
+        // only a per-row fallback, it would get ID "2" and collide with the row above;
+        // the parser must instead use the row index as the cell ID for every row.
+        const csvData = new CsvRequest(csvString);
+        expect(await csvData.getFileInfoByCellId("0")).to.not.be.undefined;
+        expect(await csvData.getFileInfoByCellId("1")).to.not.be.undefined;
+        expect(await csvData.getFileInfoByCellId("2")).to.not.be.undefined;
+        expect(await csvData.getFileInfoByCellId("3")).to.not.be.undefined;
+        expect(await csvData.getFileInfoByCellId("5")).to.be.undefined;
     });
+
     /**
      * TODO:
      * - Check for spaces in CSV input