From eb07195ea8bcd79c387c909ede43919957c2344e Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Thu, 14 Nov 2024 12:25:21 -0800 Subject: [PATCH 1/7] feat: Added CSV parser as a mock image dataset --- package-lock.json | 16 + package.json | 2 + src/state/image-dataset/csv-dataset/index.ts | 397 +++++++++++++++++++ 3 files changed, 415 insertions(+) create mode 100644 src/state/image-dataset/csv-dataset/index.ts diff --git a/package-lock.json b/package-lock.json index 91ada186..de7bee52 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,6 +19,7 @@ "firebase": "^7.24.0", "history": "^4.7.2", "lodash": "^4.17.20", + "papaparse": "^5.4.1", "plotly.js": "^2.35.2", "pram": "^2.0.0-alpha.0", "react": "^16.14.0", @@ -44,6 +45,7 @@ "@types/lodash": "^4.14.176", "@types/mini-css-extract-plugin": "^2.2.0", "@types/mocha": "^2.2.44", + "@types/papaparse": "^5.3.15", "@types/plotly.js": "^1.54.10", "@types/postcss-flexbugs-fixes": "^5.0.3", "@types/react": "^16.14.0", @@ -4302,6 +4304,15 @@ "@types/node": "*" } }, + "node_modules/@types/papaparse": { + "version": "5.3.15", + "resolved": "https://registry.npmjs.org/@types/papaparse/-/papaparse-5.3.15.tgz", + "integrity": "sha512-JHe6vF6x/8Z85nCX4yFdDslN11d+1pr12E526X8WAfhadOeaOTx5AuIkvDKIBopfvlzpzkdMx4YyvSKCM9oqtw==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/parse-json": { "version": "4.0.2", "dev": true, @@ -11804,6 +11815,11 @@ "resolved": "https://registry.npmjs.org/pako/-/pako-2.1.0.tgz", "integrity": "sha512-w+eufiZ1WuJYgPXbV/PO3NCMEc3xqylkKHzp8bxp1uW4qaSNQUkwmLLEc3kKsfz8lpV1F8Ht3U1Cm+9Srog2ug==" }, + "node_modules/papaparse": { + "version": "5.4.1", + "resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.4.1.tgz", + "integrity": "sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw==" + }, "node_modules/param-case": { "version": "3.0.4", "dev": true, diff --git a/package.json b/package.json index d88a1cbe..e695b3a9 
100755 --- a/package.json +++ b/package.json @@ -38,6 +38,7 @@ "@types/lodash": "^4.14.176", "@types/mini-css-extract-plugin": "^2.2.0", "@types/mocha": "^2.2.44", + "@types/papaparse": "^5.3.15", "@types/plotly.js": "^1.54.10", "@types/postcss-flexbugs-fixes": "^5.0.3", "@types/react": "^16.14.0", @@ -92,6 +93,7 @@ "firebase": "^7.24.0", "history": "^4.7.2", "lodash": "^4.17.20", + "papaparse": "^5.4.1", "plotly.js": "^2.35.2", "pram": "^2.0.0-alpha.0", "react": "^16.14.0", diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts new file mode 100644 index 00000000..7f6647d7 --- /dev/null +++ b/src/state/image-dataset/csv-dataset/index.ts @@ -0,0 +1,397 @@ +import { ViewerChannelSettings } from "@aics/web-3d-viewer"; +import { Album } from "../.."; +import * as Papa from "papaparse"; +import { + DataForPlot, + MeasuredFeatureDef, + FileInfo, + PerCellLabels, + MeasuredFeaturesOption, + DiscreteMeasuredFeatureDef, + ContinuousMeasuredFeatureDef, +} from "../../metadata/types"; +import { ImageDataset, InitialDatasetSelections, Megaset } from "../types"; +import firebase from "firebase"; +import { + CELL_ID_KEY, + FOV_ID_KEY, + FOV_THUMBNAIL_PATH, + FOV_VOLUME_VIEWER_PATH, + GROUP_BY_KEY, + THUMBNAIL_PATH, + TRANSFORM, + VOLUME_VIEWER_PATH, +} from "../../../constants"; + +export const DEFAULT_CSV_DATASET_KEY = "csv"; +const reservedKeys = new Set([ + CELL_ID_KEY, + FOV_ID_KEY, + FOV_THUMBNAIL_PATH, + FOV_VOLUME_VIEWER_PATH, + THUMBNAIL_PATH, + VOLUME_VIEWER_PATH, + TRANSFORM, +]); + +// From Adobe categorical colors +const DEFAULT_COLORS = [ + "#27B4AE", + "#4047C4", + "#F48730", + "#DB4281", + "#7E84F4", + "#78DF76", + "#1C7AED", + "#7129CD", + "#E7C73B", + "#C95F1E", + "#188E61", + "#BEE952", +]; + +function isNumeric(value: string): boolean { + if (typeof value != "string") { + return false; + } + return !isNaN(Number(value)) && !isNaN(parseFloat(value)); +} + +function isStringArray(data: string[] | (number | 
null)[]): data is string[] { + return data.length > 0 && typeof data[0] === "string"; +} + +const enum FeatureType { + CONTINUOUS, + DISCRETE, +} + +type FeatureInfo = + | { + type: FeatureType.CONTINUOUS; + def: ContinuousMeasuredFeatureDef; + data: (number | null)[]; + } + | { + type: FeatureType.DISCRETE; + def: DiscreteMeasuredFeatureDef; + data: (number | null)[]; + }; + +/** + * Parses and mocks an ImageDataset from a provided CSV string with a header row. + * + * The CSV must contain: + * - URL paths to volume data, under the column name "volumeViewerPath" + * + * Optionally, the CSV can contain: + * - Thumbnail paths, under the column name "thumbnailPath" + * - Cell IDs, under the column name "CellId" + * - FOV IDs, under the column name "FOVId" + * - FOV thumbnail paths, under the column name "fovThumbnailPath" + * - FOV volume data, under the column name "fovVolumeviewerPath" + * + * Some data will be ignored by default: + * - Transform data, under the column name "transform" + * + * Any other columns will be interpreted as features: + * - Columns containing only numbers will be treated as numeric data. + * - Columns containing any non-numeric data will be treated as category ("discrete") data. + */ +class CsvRequest implements ImageDataset { + csvData: Record[]; + idToIndex: Record; + featureInfo: Map; + + defaultGroupByFeatureKey: string; + + constructor(csvFileContents: string) { + this.csvData = []; + this.idToIndex = {}; + this.featureInfo = new Map(); + // TODO: Automatically detect a discrete feature and replace the group + // by feature with it. + this.defaultGroupByFeatureKey = "CellId"; + this.parseCsvData(csvFileContents); + } + + /** + * Returns all of the column names that are not reserved for metadata, with the + * assumption that they are features. 
+ */ + private getNonReservedFeatureColumns(csvData: Record[]): string[] { + const keys = Object.keys(csvData[0]); + return keys.filter((key) => !reservedKeys.has(key)); + } + + /** + * Used for default initialization. Returns the feature key at the given index, + * clamped to the length of the features array. + */ + private getFeatureKeyClamped(featureKeys: string[], index: number): string { + const lastIndex = featureKeys.length - 1; + return featureKeys[Math.min(Math.max(index, 0), lastIndex)]; + } + + /** + * Returns the feature data as a map from the feature name to an array of either + * numeric or string values. + */ + private getFeatureDataAsColumns( + csvData: Record[], + featureKeys: string[] + ): Map { + const featureData = new Map(); + for (const key of featureKeys) { + const rawValues: string[] = []; + let isContinuous = true; + for (const row of csvData) { + rawValues.push(row[key]); + if (!isNumeric(row[key])) { + isContinuous = false; + } + } + + if (isContinuous) { + // Feature is continuous, parse all values as numeric + const values = rawValues.map((val) => Number.parseFloat(val)); + featureData.set(key, values); + } else { + // Feature is discrete, return directly + featureData.set(key, rawValues); + } + } + return featureData; + } + + private parseDiscreteFeature( + key: string, + data: string[] + ): { def: DiscreteMeasuredFeatureDef; data: (number | null)[] } { + const strValueToIndex = new Map(); + const remappedValues: (number | null)[] = []; + + // Iterate through all values and count them. Replace the values with their + // corresponding index. 
+ for (let i = 0; i < data.length; i++) { + const value = data[i]; + let indexInfo = strValueToIndex.get(value); + if (!indexInfo) { + // Assign new index to this value + indexInfo = { index: strValueToIndex.size, count: 0 }; + strValueToIndex.set(value, indexInfo); + } + + indexInfo.count++; + remappedValues.push(indexInfo.index); + } + + const options: Record = {}; + for (const [value, { index, count }] of strValueToIndex.entries()) { + options[index.toString()] = { + color: DEFAULT_COLORS[index % DEFAULT_COLORS.length], + name: value, + key: value, + count: count, + }; + } + + return { + def: { + discrete: true, + displayName: key, + description: key, + key, + options, + tooltip: key, + }, + data: remappedValues, + }; + } + + private parseFeatures(csvData: Record[]): void { + this.featureInfo.clear(); + + const featureKeys = this.getNonReservedFeatureColumns(csvData); + const rawFeatureData = this.getFeatureDataAsColumns(csvData, featureKeys); + + for (const key of featureKeys) { + const data = rawFeatureData.get(key); + if (!data) { + continue; + } + if (isStringArray(data)) { + const { def, data: discreteData } = this.parseDiscreteFeature(key, data); + this.featureInfo.set(key, { + type: FeatureType.DISCRETE, + def, + data: discreteData, + }); + } else { + const def: ContinuousMeasuredFeatureDef = { + discrete: false, + displayName: key, + description: key, + key, + tooltip: key, + }; + this.featureInfo.set(key, { + type: FeatureType.CONTINUOUS, + def, + data: data, + }); + } + } + + // TODO: Feature defs can include units. Should we strip that from the feature column name? + } + + private parseCsvData(csvDataSrc: string): void { + // TODO: handle URLs and files here: they need to be handled via async callbacks. 
+ // https://www.papaparse.com/docs#strings + const result = Papa.parse(csvDataSrc, { header: true }).data as Record[]; + this.csvData = result as Record[]; + + if (this.csvData.length === 0) { + throw new Error("No data found in CSV"); + } + + // Map from cell IDs to row index. If no cell ID is provided, assign the row number. + for (let i = 0; i < this.csvData.length; i++) { + const row = this.csvData[i]; + if (row[CELL_ID_KEY] !== undefined) { + this.idToIndex[row[CELL_ID_KEY]] = i; + } else { + // Substitute with index if no cell ID is provided + row[CELL_ID_KEY] = i.toString(); + this.idToIndex[i.toString()] = i; + } + } + + this.parseFeatures(this.csvData); + } + + selectDataset(): Promise { + const featureKeys = Array.from(this.featureInfo.keys()); + return Promise.resolve({ + defaultXAxis: this.getFeatureKeyClamped(featureKeys, 0), + defaultYAxis: this.getFeatureKeyClamped(featureKeys, 1), + defaultColorBy: this.defaultGroupByFeatureKey, + defaultGroupBy: this.defaultGroupByFeatureKey, + // TODO: Provide the containing folder of the CSV if the values for the columns (thumbnails, + // downloads, volumes) are relative paths and not HTTPS URLs. + thumbnailRoot: "", + downloadRoot: "", + volumeViewerDataRoot: "", + }); + } + + getAvailableDatasets(): Promise { + // Only has one dataset (imported CSV) + const fakeSet: Megaset = { + name: "csv", + title: "CSV Dataset", + production: false, + dateCreated: firebase.firestore.Timestamp.now(), + datasets: { + csv: { + name: "csv", + title: "CSV Dataset", + version: "1", + id: DEFAULT_CSV_DATASET_KEY, + description: "A dataset imported from a CSV file", + index: 0, + userData: {}, + }, + }, + }; + + return Promise.resolve([fakeSet]); + } + + getViewerChannelSettings(): Promise { + // By default, enable first three channels + // TODO: Have this constant be exposed by w3cv? 
+ return Promise.resolve({ + groups: [ + { + name: "Channels", + channels: [ + { match: [0, 1, 2], enabled: true }, + { match: "(.+)", enabled: false }, + ], + }, + ], + }); + } + + private getFeatureKeyToData(): Record { + const featureKeyToData: Record = {}; + for (const [key, info] of this.featureInfo.entries()) { + featureKeyToData[key] = info.data; + } + return featureKeyToData; + } + + getFeatureData(): Promise { + const indices = this.csvData.map((_row, index) => index); + const values: Record = this.getFeatureKeyToData(); + const labels: PerCellLabels = { + thumbnailPaths: [], + cellIds: [], + }; + + for (let i = 0; i < indices.length; i++) { + // TODO: Calculate in advance + const row = this.csvData[i]; + // Copy label data + labels.cellIds.push(row[CELL_ID_KEY]); + labels.thumbnailPaths.push(row[THUMBNAIL_PATH] || ""); + } + + return Promise.resolve({ + indices, + values, + labels, + }); + } + + getAlbumData(): Promise { + return Promise.resolve([]); + } + + getMeasuredFeatureDefs(): Promise { + const featureDefsArray = Array.from(this.featureInfo.values()).map((info) => info.def); + return Promise.resolve(featureDefsArray); + } + + getFileInfoByCellId(id: string): Promise { + const rowIndex = this.idToIndex[id]; + if (rowIndex === undefined) { + return Promise.resolve(undefined); + } + const data = this.csvData[rowIndex]; + + if (!data) { + return Promise.resolve(undefined); + } + const fileInfo = { + [CELL_ID_KEY]: data[CELL_ID_KEY] || "", + [FOV_ID_KEY]: data[FOV_ID_KEY] || "", + [FOV_THUMBNAIL_PATH]: data[FOV_THUMBNAIL_PATH] || "", + [FOV_VOLUME_VIEWER_PATH]: data[FOV_VOLUME_VIEWER_PATH] || "", + [THUMBNAIL_PATH]: data[THUMBNAIL_PATH] || "", + [VOLUME_VIEWER_PATH]: data[VOLUME_VIEWER_PATH] || "", + [GROUP_BY_KEY]: data[GROUP_BY_KEY] || this.defaultGroupByFeatureKey, + }; + return Promise.resolve(fileInfo); + } + + getFileInfoByArrayOfCellIds(ids: string[]): Promise<(FileInfo | undefined)[]> { + const promises = ids.map((id) => 
this.getFileInfoByCellId(id)); + const result = Promise.all(promises); + return Promise.resolve(result); + } +} + +export default CsvRequest; From a2549991b6e519800f9cc46dd5f65b19e24d543e Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Thu, 14 Nov 2024 12:45:47 -0800 Subject: [PATCH 2/7] feat: Added unit tests, handled NaN values --- src/state/image-dataset/csv-dataset/index.ts | 4 ++ .../csv-dataset/test/CsvRequest.test.ts | 39 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts index 7f6647d7..2216d1a0 100644 --- a/src/state/image-dataset/csv-dataset/index.ts +++ b/src/state/image-dataset/csv-dataset/index.ts @@ -54,6 +54,9 @@ function isNumeric(value: string): boolean { if (typeof value != "string") { return false; } + if (value.trim().toLowerCase() === "nan") { + return true; + } return !isNaN(Number(value)) && !isNaN(parseFloat(value)); } @@ -154,6 +157,7 @@ class CsvRequest implements ImageDataset { if (isContinuous) { // Feature is continuous, parse all values as numeric + // TODO: Handle empty/blank values const values = rawValues.map((val) => Number.parseFloat(val)); featureData.set(key, values); } else { diff --git a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts new file mode 100644 index 00000000..ed823690 --- /dev/null +++ b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts @@ -0,0 +1,39 @@ +import { expect } from "chai"; +import CsvRequest from ".."; + +const testCsv = + "CellId,volumeviewerPath,thumbnailPath,feature1,feature2,feature3,discretefeature" + + "\npotato,https://example.com/1/raw.ome.zarr,https://example.com/1.png,1,2,3,A" + + "\ngarbanzo,https://example.com/2/raw.ome.zarr,https://example.com/2.jpeg,7,3.4,1,B" + + 
"\nturnip,https://example.com/3/raw.ome.zarr,https://example.com/3.jpeg,4,5,6,B" + + "\nrutabaga,https://example.com/4/raw.ome.zarr,https://example.com/4.jpeg,9,2.8,NaN,C"; + +describe("CsvRequest", () => { + it("can be initialized with test data", () => { + new CsvRequest(testCsv); + }); + + it("extracts feature data", async () => { + const csvDataset = new CsvRequest(testCsv); + + const featureData = await csvDataset.getFeatureData(); + expect(featureData).to.deep.equal({ + indices: [0, 1, 2, 3], + values: { + feature1: [1, 7, 4, 9], + feature2: [2, 3.4, 5, 2.8], + feature3: [3, 1, 6, NaN], + discretefeature: [0, 1, 1, 2], + }, + labels: { + thumbnailPaths: [ + "https://example.com/1.png", + "https://example.com/2.jpeg", + "https://example.com/3.jpeg", + "https://example.com/4.jpeg", + ], + cellIds: ["potato", "garbanzo", "turnip", "rutabaga"], + }, + }); + }); +}); From 5196cd3f1afbd64f6e6e8aa35d5f48b27c9e0a58 Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Thu, 14 Nov 2024 12:49:46 -0800 Subject: [PATCH 3/7] doc: Added additional TODOs for CSV unit tests --- .../image-dataset/csv-dataset/test/CsvRequest.test.ts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts index ed823690..f2143cac 100644 --- a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts +++ b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts @@ -36,4 +36,13 @@ describe("CsvRequest", () => { }, }); }); + /** + * TODO: + * - Check for spaces in CSV input + * - Check for empty values in CSV input + * - Check for null/NaN values in CSV input + * - Check for behavior when there is no discrete feature column -> validate groupby + * - Check for handling of BFF-specific column names (they should be remapped) + * - Check that metadata-related columns are not parsed as features + */ }); From 608f0a01e4138f57607122f4d926ea89f864ec4f Mon Sep 17 00:00:00 2001 From: 
Peyton Lee Date: Thu, 14 Nov 2024 13:03:39 -0800 Subject: [PATCH 4/7] refactor: Renamed constants --- src/state/image-dataset/csv-dataset/index.ts | 6 +++--- src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts index 2216d1a0..0644a3ce 100644 --- a/src/state/image-dataset/csv-dataset/index.ts +++ b/src/state/image-dataset/csv-dataset/index.ts @@ -24,7 +24,8 @@ import { } from "../../../constants"; export const DEFAULT_CSV_DATASET_KEY = "csv"; -const reservedKeys = new Set([ + +const METADATA_KEYS = new Set([ CELL_ID_KEY, FOV_ID_KEY, FOV_THUMBNAIL_PATH, @@ -124,7 +125,7 @@ class CsvRequest implements ImageDataset { */ private getNonReservedFeatureColumns(csvData: Record[]): string[] { const keys = Object.keys(csvData[0]); - return keys.filter((key) => !reservedKeys.has(key)); + return keys.filter((key) => !METADATA_KEYS.has(key)); } /** @@ -346,7 +347,6 @@ class CsvRequest implements ImageDataset { }; for (let i = 0; i < indices.length; i++) { - // TODO: Calculate in advance const row = this.csvData[i]; // Copy label data labels.cellIds.push(row[CELL_ID_KEY]); diff --git a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts index f2143cac..60f484c8 100644 --- a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts +++ b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts @@ -43,6 +43,7 @@ describe("CsvRequest", () => { * - Check for null/NaN values in CSV input * - Check for behavior when there is no discrete feature column -> validate groupby * - Check for handling of BFF-specific column names (they should be remapped) + * - Check that metadata columns are parsed correctly * - Check that metadata-related columns are not parsed as features */ }); From df8084df1e93cc1ba7f0bcdc4d29c345c8d07667 Mon Sep 17 00:00:00 2001 
From: Peyton Lee Date: Thu, 14 Nov 2024 13:58:25 -0800 Subject: [PATCH 5/7] refactor: Renamed methods --- src/state/image-dataset/csv-dataset/index.ts | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts index 0644a3ce..ceda570e 100644 --- a/src/state/image-dataset/csv-dataset/index.ts +++ b/src/state/image-dataset/csv-dataset/index.ts @@ -123,7 +123,7 @@ class CsvRequest implements ImageDataset { * Returns all of the column names that are not reserved for metadata, with the * assumption that they are features. */ - private getNonReservedFeatureColumns(csvData: Record[]): string[] { + private getFeatureKeysFromColumnNames(csvData: Record[]): string[] { const keys = Object.keys(csvData[0]); return keys.filter((key) => !METADATA_KEYS.has(key)); } @@ -217,7 +217,7 @@ class CsvRequest implements ImageDataset { private parseFeatures(csvData: Record[]): void { this.featureInfo.clear(); - const featureKeys = this.getNonReservedFeatureColumns(csvData); + const featureKeys = this.getFeatureKeysFromColumnNames(csvData); const rawFeatureData = this.getFeatureDataAsColumns(csvData, featureKeys); for (const key of featureKeys) { @@ -264,13 +264,10 @@ class CsvRequest implements ImageDataset { // Map from cell IDs to row index. If no cell ID is provided, assign the row number. 
for (let i = 0; i < this.csvData.length; i++) { const row = this.csvData[i]; - if (row[CELL_ID_KEY] !== undefined) { - this.idToIndex[row[CELL_ID_KEY]] = i; - } else { - // Substitute with index if no cell ID is provided + if (row[CELL_ID_KEY] === undefined) { row[CELL_ID_KEY] = i.toString(); - this.idToIndex[i.toString()] = i; } + this.idToIndex[row[CELL_ID_KEY]] = i; } this.parseFeatures(this.csvData); From ac8ad66d76f9b4c8104ea45c91f37249c3327c39 Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Tue, 19 Nov 2024 17:49:29 -0800 Subject: [PATCH 6/7] refactor: Update comments, use constants, remove redundant type casting --- src/state/image-dataset/csv-dataset/index.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts index ceda570e..f02d8785 100644 --- a/src/state/image-dataset/csv-dataset/index.ts +++ b/src/state/image-dataset/csv-dataset/index.ts @@ -35,7 +35,8 @@ const METADATA_KEYS = new Set([ TRANSFORM, ]); -// From Adobe categorical colors +// Adobe palette of high-contrast colors for denoting different categories +// Used for categorical data const DEFAULT_COLORS = [ "#27B4AE", "#4047C4", @@ -99,7 +100,7 @@ type FeatureInfo = * - Transform data, under the column name "transform" * * Any other columns will be interpreted as features: - * - Columns containing only numbers will be treated as numeric data. + * - Columns containing only numbers will be treated as numeric ("continuous") data. * - Columns containing any non-numeric data will be treated as category ("discrete") data. */ class CsvRequest implements ImageDataset { @@ -115,7 +116,7 @@ class CsvRequest implements ImageDataset { this.featureInfo = new Map(); // TODO: Automatically detect a discrete feature and replace the group // by feature with it. 
- this.defaultGroupByFeatureKey = "CellId"; + this.defaultGroupByFeatureKey = CELL_ID_KEY; this.parseCsvData(csvFileContents); } @@ -254,7 +255,7 @@ class CsvRequest implements ImageDataset { private parseCsvData(csvDataSrc: string): void { // TODO: handle URLs and files here: they need to be handled via async callbacks. // https://www.papaparse.com/docs#strings - const result = Papa.parse(csvDataSrc, { header: true }).data as Record[]; + const result = Papa.parse(csvDataSrc, { header: true }).data; this.csvData = result as Record[]; if (this.csvData.length === 0) { From 7dbd6e9994b137df31eecb99c68f53140eb6db07 Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Tue, 19 Nov 2024 18:06:41 -0800 Subject: [PATCH 7/7] fix: Handled edge case where row numbers and cell IDs could collide --- src/state/image-dataset/csv-dataset/index.ts | 13 ++++++++++++- .../csv-dataset/test/CsvRequest.test.ts | 19 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/state/image-dataset/csv-dataset/index.ts b/src/state/image-dataset/csv-dataset/index.ts index f02d8785..dfee746b 100644 --- a/src/state/image-dataset/csv-dataset/index.ts +++ b/src/state/image-dataset/csv-dataset/index.ts @@ -262,10 +262,21 @@ class CsvRequest implements ImageDataset { throw new Error("No data found in CSV"); } - // Map from cell IDs to row index. If no cell ID is provided, assign the row number. + let useOriginalKey = true; + // Check if all rows have a cell ID. If not, we must use the row index + // instead to prevent duplicate values from being added to the map. for (let i = 0; i < this.csvData.length; i++) { const row = this.csvData[i]; if (row[CELL_ID_KEY] === undefined) { + useOriginalKey = false; + break; + } + } + + // Map from cell IDs to row index. If no cell ID is provided, assign the row number. 
+ for (let i = 0; i < this.csvData.length; i++) { + const row = this.csvData[i]; + if (!useOriginalKey) { row[CELL_ID_KEY] = i.toString(); } this.idToIndex[row[CELL_ID_KEY]] = i; diff --git a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts index 60f484c8..4c1d8738 100644 --- a/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts +++ b/src/state/image-dataset/csv-dataset/test/CsvRequest.test.ts @@ -35,7 +35,26 @@ describe("CsvRequest", () => { cellIds: ["potato", "garbanzo", "turnip", "rutabaga"], }, }); + + it("handles cell id vs. row index collisions when data is incomplete.", async () => { + const csvString = `CellId,feature1 + 0,A + 2,B, + ,C + 5,D`; + // Note that in the above CSV, if we directly used the cell ID with + // row index as a fallback for undefined values, we would end up with + // two rows that have index 2. Instead, the CSV parser should detect + // that data is missing and use the row index for all cell IDs instead. + const csvData = new CsvRequest(csvString); + expect(await csvData.getFileInfoByCellId("0")).to.not.be.undefined; + expect(await csvData.getFileInfoByCellId("1")).to.not.be.undefined; + expect(await csvData.getFileInfoByCellId("2")).to.not.be.undefined; + expect(await csvData.getFileInfoByCellId("3")).to.not.be.undefined; + expect(await csvData.getFileInfoByCellId("5")).to.be.undefined; + }); }); + /** * TODO: * - Check for spaces in CSV input