Skip to content

Commit

Permalink
feature: Parse BFF CSVs (CSV pt. 2) (#215)
Browse files Browse the repository at this point in the history
* feat: Added CSV parser as a mock image dataset

* feat: Added unit tests, handled NaN values

* doc: Added additional TODOs for CSV unit tests

* refactor: Renamed constants

* refactor: Renamed methods

* feat: Check for BFF keys

* refactor: Simplified default groupby feature behavior

* fix: Added handling for spaces in headers and values

* feat: Added additional unit tests for data validation

* feat: Added unit tests for BFF parsing validation

* refactor: Code cleanup

* fix: Handled empty string in cell ID

* refactor: Renamed methods for clarity

* refactor: Added remapping helper function

* refactor: Moved types, const assignment, docstring updates
  • Loading branch information
ShrimpCryptid authored Dec 2, 2024
1 parent adff277 commit b126f16
Show file tree
Hide file tree
Showing 2 changed files with 475 additions and 71 deletions.
225 changes: 185 additions & 40 deletions src/state/image-dataset/csv-dataset/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ import {
} from "../../../constants";

// NOTE(review): presumably the identifier under which the CSV-backed dataset is
// registered; its consumers are outside this view — confirm against callers.
export const DEFAULT_CSV_DATASET_KEY = "csv";
// Key of the placeholder discrete feature (a single "Default" category) that is
// registered as the group-by feature when the CSV has no discrete feature.
export const DEFAULT_GROUPBY_NONE = "_defaultGroupByNone";

// Column names recognized from BFF-style CSVs.
// Fallback source for the cell ID column.
const BFF_FILE_ID_KEY = "File ID";
// Fallback source for the thumbnail path column.
const BFF_THUMBNAIL_PATH_KEY = "Thumbnail";
// Fallback source for the volume viewer path column.
const BFF_FILE_PATH_KEY = "File Path";
// Column preferred as the default group-by feature.
const BFF_DEFAULT_GROUP_BY_KEY = "Cell Line";
// Second-choice fallback source for the cell ID column (tried after "File ID").
const BFF_FILENAME_KEY = "File Name";
// Treated as metadata, so never interpreted as a feature column.
const BFF_FILE_SIZE_KEY = "File Size";
// Treated as metadata, so never interpreted as a feature column.
const BFF_UPLOADED_KEY = "Uploaded";

const METADATA_KEYS = new Set([
CELL_ID_KEY,
Expand All @@ -33,6 +42,12 @@ const METADATA_KEYS = new Set([
THUMBNAIL_PATH,
VOLUME_VIEWER_PATH,
TRANSFORM,
BFF_FILE_ID_KEY,
BFF_THUMBNAIL_PATH_KEY,
BFF_FILE_PATH_KEY,
BFF_FILENAME_KEY,
BFF_FILE_SIZE_KEY,
BFF_UPLOADED_KEY,
]);

// Adobe palette of high-contrast colors for denoting different categories
Expand All @@ -52,20 +67,6 @@ const DEFAULT_COLORS = [
"#BEE952",
];

/**
 * Returns true when `value` is a string that can be read as a number by
 * `parseFloat`, or is the literal text "NaN" (any casing, surrounding
 * whitespace ignored).
 *
 * `Number()` and `parseFloat()` must agree on the value. `Number()` alone
 * accepts radix-prefixed text such as "0x10" or "0b11", which `parseFloat`
 * reads as 0; accepting those would classify a column as numeric and then
 * silently turn every such cell into 0 when it is parsed.
 *
 * @param value Raw cell text.
 * @returns Whether the text is a parseable number or "NaN".
 */
function isNumeric(value: string): boolean {
  if (typeof value !== "string") {
    return false;
  }
  if (value.trim().toLowerCase() === "nan") {
    return true;
  }
  const strict = Number(value);
  const lenient = Number.parseFloat(value);
  // NaN never equals itself, so the equality check also rejects unparseable text.
  return !Number.isNaN(strict) && strict === lenient;
}

/**
 * Type guard: treats the array as a string array when it is non-empty and its
 * first element is a string. An empty array is reported as not a string array.
 */
function isStringArray(data: string[] | (number | null)[]): data is string[] {
  if (data.length === 0) {
    return false;
  }
  const [head] = data;
  return typeof head === "string";
}

const enum FeatureType {
CONTINUOUS,
DISCRETE,
Expand All @@ -83,25 +84,74 @@ type FeatureInfo =
data: (number | null)[];
};

/**
 * Values of a single feature column: either all numeric or all string entries,
 * with `null` standing in for missing values.
 */
type FeatureData = (number | null)[] | (string | null)[];

/**
 * Returns true when `value` is `null`, the literal text "NaN" (any casing,
 * surrounding whitespace ignored), or a string that `parseFloat` reads as a
 * number.
 *
 * `Number()` and `parseFloat()` must agree on the value. `Number()` alone
 * accepts radix-prefixed text such as "0x10" or "0b11", which `parseFloat`
 * reads as 0; accepting those would classify a column as numeric and then
 * silently turn every such cell into 0 when it is parsed.
 *
 * @param value Raw cell text, or `null` for a missing cell.
 * @returns Whether the value is compatible with a numeric column.
 */
function isNullOrNumericString(value: string | null): boolean {
  if (value === null) {
    return true;
  }
  if (typeof value !== "string") {
    return false;
  }
  if (value.trim().toLowerCase() === "nan") {
    return true;
  }
  const strict = Number(value);
  const lenient = Number.parseFloat(value);
  // NaN never equals itself, so the equality check also rejects unparseable text.
  return !Number.isNaN(strict) && strict === lenient;
}

/**
 * Type guard distinguishing string feature data from numeric feature data.
 * The decision is made from the first entry that is not `null`; when every
 * entry is `null` (or the array is empty) the result is false.
 */
function isStringArray(data: FeatureData): data is (string | null)[] {
  for (const entry of data) {
    if (entry === null) {
      continue;
    }
    // The first non-null entry decides the type of the whole column.
    return typeof entry === "string";
  }
  return false;
}

/**
 * Reports whether a feature column holds any usable data, i.e. at least one
 * entry that is not `null`. Empty arrays and all-`null` arrays yield false.
 */
function isValidFeatureArray(data: FeatureData): boolean {
  for (const entry of data) {
    if (entry !== null) {
      return true;
    }
  }
  // Reached for an empty array as well as for an all-null array.
  return false;
}

/**
* Parses and mocks an ImageDataset from a provided CSV string with a header row.
*
* The CSV must contain:
* - URL paths to volume data, under the column name "volumeViewerPath"
* - URL paths to volume data, under the column name "volumeViewerPath" or "File Path"
*
* Optionally, the CSV can contain:
* - Thumbnail paths, under the column name "thumbnailPath"
* - Cell IDs, under the column name "CellId"
* - Thumbnail paths, under the column name "thumbnailPath" or "Thumbnail"
* - Cell IDs, under the column name "CellId" or "File ID"
* - FOV IDs, under the column name "FOVId"
* - FOV thumbnail paths, under the column name "fovThumbnailPath"
* - FOV volume data, under the column name "fovVolumeviewerPath"
*
* Some data will be ignored by default:
* - Transform data, under the column name "transform"
* - Filename, under the column name "File Name"
* - File size, under the column name "File Size"
* - Uploaded, a 0/1 flag stored under the column name "Uploaded"
*
* Any other columns will be interpreted as features:
* - Columns containing only numbers will be treated as numeric ("continuous") data.
* - Columns containing any non-numeric data will be treated as category ("discrete") data.
* - Columns containing only number and `null` values will be treated as numeric ("continuous") data.
* (note: `NaN` values are allowed.)
* - Columns containing ANY non-numeric or non-null data will be treated as
* category ("discrete") data.
*/
class CsvRequest implements ImageDataset {
csvData: Record<string, string>[];
Expand All @@ -114,19 +164,17 @@ class CsvRequest implements ImageDataset {
this.csvData = [];
this.idToIndex = {};
this.featureInfo = new Map();
// TODO: Automatically detect a discrete feature and replace the group
// by feature with it.
this.defaultGroupByFeatureKey = CELL_ID_KEY;
this.defaultGroupByFeatureKey = "";
this.parseCsvData(csvFileContents);
}

/**
 * Returns all of the column names that are not empty or reserved for metadata,
 * with the assumption that they are features.
 *
 * Column names are taken from the keys of the first row only.
 *
 * @param csvData Parsed CSV rows, keyed by column name.
 * @returns The feature column names; an empty array when there are no rows.
 */
private getFeatureKeysFromColumnNames(csvData: Record<string, string>[]): string[] {
  const firstRow = csvData[0];
  if (firstRow === undefined) {
    return [];
  }
  // A blank header has no usable name, so it is never treated as a feature.
  return Object.keys(firstRow).filter((key) => key !== "" && !METADATA_KEYS.has(key));
}

/**
Expand All @@ -145,22 +193,27 @@ class CsvRequest implements ImageDataset {
private getFeatureDataAsColumns(
csvData: Record<string, string>[],
featureKeys: string[]
): Map<string, string[] | number[]> {
const featureData = new Map<string, string[] | number[]>();
): Map<string, FeatureData> {
const featureData = new Map<string, FeatureData>();
for (const key of featureKeys) {
const rawValues: string[] = [];
const rawValues: (string | null)[] = [];
let isContinuous = true;
for (const row of csvData) {
rawValues.push(row[key]);
if (!isNumeric(row[key])) {
isContinuous = false;
const value = row[key] ?? null;
if (value === null || value.trim() === "") {
rawValues.push(null);
} else {
rawValues.push(value);
if (!isNullOrNumericString(value)) {
isContinuous = false;
}
}
}

if (isContinuous) {
// Feature is continuous, parse all values as numeric
// TODO: Handle empty/blank values
const values = rawValues.map((val) => Number.parseFloat(val));
const values = rawValues.map((val) => (val ? Number.parseFloat(val) : null));
featureData.set(key, values);
} else {
// Feature is discrete, return directly
Expand All @@ -172,15 +225,20 @@ class CsvRequest implements ImageDataset {

private parseDiscreteFeature(
key: string,
data: string[]
data: (string | null)[]
): { def: DiscreteMeasuredFeatureDef; data: (number | null)[] } {
const strValueToIndex = new Map<string, { index: number; count: number }>();
const remappedValues: (number | null)[] = [];

// Iterate through all values and count them. Replace the values with their
// corresponding index.
for (let i = 0; i < data.length; i++) {
const value = data[i];
const rawValue = data[i];
if (rawValue === null) {
remappedValues.push(null);
continue;
}
const value = rawValue.trim();
let indexInfo = strValueToIndex.get(value);
if (!indexInfo) {
// Assign new index to this value
Expand Down Expand Up @@ -223,7 +281,7 @@ class CsvRequest implements ImageDataset {

for (const key of featureKeys) {
const data = rawFeatureData.get(key);
if (!data) {
if (!data || !isValidFeatureArray(data)) {
continue;
}
if (isStringArray(data)) {
Expand Down Expand Up @@ -252,22 +310,108 @@ class CsvRequest implements ImageDataset {
// TODO: Feature defs can include units. Should we strip that from the feature column name?
}

/**
 * Assigns a default group-by feature key for the dataset. Datasets must have a
 * discrete group-by feature or CFE will crash.
 *
 * Key is chosen in the following order:
 * 1. Default BFF group by key ("Cell Line") if it was parsed as a discrete feature
 * 2. First discrete feature key if it exists
 * 3. A default bin feature if no discrete feature exists
 *
 * Reads `this.featureInfo`, so features must already have been parsed. The
 * "Cell Line" column is only chosen when it is registered there as discrete:
 * a column that is merely present in the CSV may have been parsed as
 * continuous, or skipped entirely for having no values, and neither is a
 * usable group-by feature.
 *
 * @param csvData Parsed CSV rows; only the row count is used, to size the
 *   placeholder feature.
 */
private assignDefaultGroupByFeatureKey(csvData: Record<string, string>[]): void {
  const discreteFeatures = Array.from(this.featureInfo.values()).filter(
    (info) => info.type === FeatureType.DISCRETE
  );

  // Prefer the BFF-specific group-by feature, otherwise the first discrete feature.
  const preferred =
    discreteFeatures.find((info) => info.def.key === BFF_DEFAULT_GROUP_BY_KEY) ??
    discreteFeatures[0];
  if (preferred) {
    this.defaultGroupByFeatureKey = preferred.def.key;
    return;
  }

  // If no discrete feature is found, register a placeholder feature that puts
  // every row into a single "Default" category.
  const options: Record<string, MeasuredFeaturesOption> = {
    "0": {
      color: DEFAULT_COLORS[0],
      name: "Default",
      key: "0",
      count: csvData.length,
    },
  };
  this.featureInfo.set(DEFAULT_GROUPBY_NONE, {
    type: FeatureType.DISCRETE,
    def: {
      discrete: true,
      displayName: "(None)",
      description: "(None)",
      key: DEFAULT_GROUPBY_NONE,
      tooltip: "(None)",
      options,
    },
    data: new Array(csvData.length).fill(0),
  });
  this.defaultGroupByFeatureKey = DEFAULT_GROUPBY_NONE;
}

/**
 * Copies the value of a column to another column if the destination column is empty.
 * Returns whether the column was copied.
 *
 * A column counts as empty when it is missing from the row or contains only
 * whitespace. Nothing is copied when the source column is itself empty, so a
 * `true` result always means the destination now holds a non-blank value and
 * callers can chain fallbacks on a `false` result.
 *
 * @param row Row to modify in place.
 * @param columnSrc Name of the column to copy from.
 * @param columnDst Name of the column to copy into.
 * @returns True if the destination was filled from the source.
 */
private copyColumnIfEmpty(
  row: Record<string, string>,
  columnSrc: string,
  columnDst: string
): boolean {
  const isBlank = (value: string | undefined): boolean =>
    value == null || value.trim() === "";

  if (isBlank(row[columnSrc]) || !isBlank(row[columnDst])) {
    return false;
  }
  row[columnDst] = row[columnSrc];
  return true;
}

/**
 * Fills the standard cell ID, thumbnail path and volume viewer path columns of
 * a row from their BFF equivalents. The row is modified in place.
 */
private remapBffKeys = (row: Record<string, string>): void => {
  // File ID is the preferred source for the cell ID; File Name is only tried
  // when no File ID value was copied.
  const copiedFileId = this.copyColumnIfEmpty(row, BFF_FILE_ID_KEY, CELL_ID_KEY);
  if (!copiedFileId) {
    this.copyColumnIfEmpty(row, BFF_FILENAME_KEY, CELL_ID_KEY);
  }

  // The remaining columns each have a single BFF source.
  const directMappings: [string, string][] = [
    [BFF_THUMBNAIL_PATH_KEY, THUMBNAIL_PATH],
    [BFF_FILE_PATH_KEY, VOLUME_VIEWER_PATH],
  ];
  for (const [source, destination] of directMappings) {
    this.copyColumnIfEmpty(row, source, destination);
  }
};

private parseCsvData(csvDataSrc: string): void {
// TODO: handle URLs and files here: they need to be handled via async callbacks.
// https://www.papaparse.com/docs#strings
const result = Papa.parse(csvDataSrc, { header: true }).data;
const config: Papa.ParseConfig = {
header: true,
transformHeader: (header: string) => header.trim(),
skipEmptyLines: "greedy", // skips whitespace-only lines
};
const result = Papa.parse(csvDataSrc, config).data as Record<string, string>[];
this.csvData = result as Record<string, string>[];

// Some assertion tests, throw errors if data can't be parsed
if (this.csvData.length === 0) {
throw new Error("No data found in CSV");
}

let useOriginalKey = true;
// Map certain BFF keys to the standard keys
for (let i = 0; i < this.csvData.length; i++) {
this.remapBffKeys(this.csvData[i]);
}

// Check if all rows have a cell ID. If not, we must use the row index
// instead to prevent duplicate values from being added to the map.
let useOriginalKey = true;
for (let i = 0; i < this.csvData.length; i++) {
const row = this.csvData[i];
if (row[CELL_ID_KEY] === undefined) {
if (row[CELL_ID_KEY] === undefined || row[CELL_ID_KEY].trim() === "") {
useOriginalKey = false;
break;
}
Expand All @@ -279,10 +423,11 @@ class CsvRequest implements ImageDataset {
if (!useOriginalKey) {
row[CELL_ID_KEY] = i.toString();
}
this.idToIndex[row[CELL_ID_KEY]] = i;
this.idToIndex[row[CELL_ID_KEY].trim()] = i;
}

this.parseFeatures(this.csvData);
this.assignDefaultGroupByFeatureKey(this.csvData);
}

selectDataset(): Promise<InitialDatasetSelections> {
Expand Down Expand Up @@ -347,7 +492,7 @@ class CsvRequest implements ImageDataset {
return featureKeyToData;
}

getFeatureData(): Promise<DataForPlot | void> {
getFeatureData(): Promise<DataForPlot> {
const indices = this.csvData.map((_row, index) => index);
const values: Record<string, (number | null)[]> = this.getFeatureKeyToData();
const labels: PerCellLabels = {
Expand Down
Loading

0 comments on commit b126f16

Please sign in to comment.