diff --git a/python/deeplake/__init__.py b/python/deeplake/__init__.py index 5684486de2..538b73c46e 100644 --- a/python/deeplake/__init__.py +++ b/python/deeplake/__init__.py @@ -14,7 +14,7 @@ def progress_bar(iterable, *args, **kwargs): import deeplake from ._deeplake import * -__version__ = "4.1.1" +__version__ = "4.1.2" __all__ = [ "__version__", @@ -128,9 +128,7 @@ def progress_bar(iterable, *args, **kwargs): "types", "Client", "client", - "__child_atfork", "__prepare_atfork", - "__parent_atfork", ] @@ -259,7 +257,7 @@ def transfer_with_links(source, dest, links, column_names): def __register_at_fork(): - from ._deeplake import __prepare_atfork, __parent_atfork, __child_atfork + from ._deeplake import __prepare_atfork UNSAFE_TYPES = ( Dataset, @@ -300,13 +298,13 @@ def check_main_globals_for_unsafe_types(): def before_fork(): check_main_globals_for_unsafe_types() - __prepare_atfork() + pass def after_fork_parent(): - __parent_atfork() + pass def after_fork_child(): - __child_atfork() + pass os.register_at_fork( before=before_fork, @@ -314,5 +312,11 @@ def after_fork_child(): after_in_child=after_fork_child, ) + ff = os.fork + def fork(): + __prepare_atfork() + return ff() + + os.fork = fork __register_at_fork() diff --git a/python/deeplake/__init__.pyi b/python/deeplake/__init__.pyi index 7fb934332a..7ee819eb5b 100644 --- a/python/deeplake/__init__.pyi +++ b/python/deeplake/__init__.pyi @@ -118,304 +118,572 @@ __all__ = [ "types", "Client", "client", - "__child_atfork", "__prepare_atfork", - "__parent_atfork", ] class Future: """ - A future that represents a value that will be resolved in the future. + A future representing an asynchronous operation result in ML pipelines. - Once the Future is resolved, it will hold the result, and you can retrieve it - using either a blocking call (`result()`) or via asynchronous mechanisms (`await`). - - The future will resolve automatically even if you do not explicitly wait for it. + The Future class enables non-blocking operations for data loading and processing, + particularly useful when working with large ML datasets or distributed training. + Once resolved, the Future holds the operation result which can be accessed either + synchronously or asynchronously. Methods: result() -> typing.Any: - Blocks until the Future is resolved and returns the object. + Blocks until the Future resolves and returns the result. __await__() -> typing.Any: - Awaits the future asynchronously and returns the object once it's ready. + Enables using the Future in async/await syntax. is_completed() -> bool: - Returns True if the Future is already resolved, False otherwise. + Checks if the Future has resolved without blocking. + + + Examples: + Loading ML dataset asynchronously: + ```python + future = deeplake.open_async("s3://ml-data/embeddings") + + # Check status without blocking + if not future.is_completed(): + print("Still loading...") + + # Block until ready + ds = future.result() + ``` + + Using with async/await: + ```python + async def load_data(): + ds = await deeplake.open_async("s3://ml-data/images") + batch = await ds.images.get_async(slice(0, 32)) + return batch + ``` """ def result(self) -> typing.Any: """ - Blocks until the Future is resolved, then returns the result. + Blocks until the Future resolves and returns the result. Returns: - typing.Any: The result when the Future is resolved. + typing.Any: The operation result once resolved. 
+ + + + Examples: + ```python + future = ds["images"].get_async(slice(0, 32)) + batch = future.result() # Blocks until batch is loaded + ``` """ ... def __await__(self) -> typing.Any: """ - Awaits the resolution of the Future asynchronously. + Makes the Future compatible with async/await syntax. + + Examples: - ```python - result = await future - ``` + ```python + async def load_batch(): + batch = await ds["images"].get_async(slice(0, 32)) + ``` Returns: - typing.Any: The result when the Future is resolved. + typing.Any: The operation result once resolved. """ ... def is_completed(self) -> bool: """ - Checks if the Future has been resolved. + Checks if the Future has resolved without blocking. Returns: - bool: True if the Future is resolved, False otherwise. + bool: True if resolved, False if still pending. + + + + Examples: + ```python + future = ds.query_async("SELECT * WHERE label = 'car'") + if future.is_completed(): + results = future.result() + else: + print("Query still running...") + ``` """ ... class FutureVoid: """ - A future that represents the completion of an operation that returns no result. - - The future will resolve automatically to `None`, even if you do not explicitly wait for it. + A Future representing a void async operation in ML pipelines. + + Similar to Future but for operations that don't return values, like saving + or committing changes. Useful for non-blocking data management operations. Methods: wait() -> None: - Blocks until the FutureVoid is resolved and then returns `None`. + Blocks until operation completes. __await__() -> None: - Awaits the FutureVoid asynchronously and returns `None` once the operation is complete. + Enables using with async/await syntax. is_completed() -> bool: - Returns True if the FutureVoid is already resolved, False otherwise. + Checks completion status without blocking. + + + + Examples: + Asynchronous dataset updates: + ```python + # Update embeddings without blocking + future = ds["embeddings"].set_async(slice(0, 32), new_embeddings) + + # Do other work while update happens + process_other_data() + + # Wait for update to complete + future.wait() + ``` + + Using with async/await: + ```python + async def update_dataset(): + await ds.commit_async() + print("Changes saved") + ``` """ def wait(self) -> None: """ - Blocks until the FutureVoid is resolved, then returns `None`. + Blocks until the operation completes. + + Examples: ```python - future_void.wait() # Blocks until the operation completes. + future = ds.commit_async() + future.wait() # Blocks until commit finishes ``` - - Returns: - None: Indicates the operation has completed. """ ... def __await__(self) -> None: """ - Awaits the resolution of the FutureVoid asynchronously. + Makes the FutureVoid compatible with async/await syntax. - Examples: - ```python - await future_void # Waits for the completion of the async operation. - ``` + - Returns: - None: Indicates the operation has completed. + Examples: + ```python + async def save_changes(): + await ds.commit_async() + ``` """ ... def is_completed(self) -> bool: """ - Checks if the FutureVoid has been resolved. + Checks if the operation has completed without blocking. Returns: - bool: True if the FutureVoid is resolved, False otherwise. + bool: True if completed, False if still running. + + + + Examples: + ```python + future = ds.commit_async() + if future.is_completed(): + print("Commit finished") + else: + print("Commit still running...") + ``` """ ... class ReadOnlyMetadata: """ - ReadOnlyMetadata is a key-value store. 
+ Read-only access to dataset and column metadata for ML workflows. + + Stores important information about datasets like: + - Model parameters and hyperparameters + - Preprocessing statistics (mean, std, etc.) + - Data splits and fold definitions + - Version and training information + + + + Examples: + Accessing model metadata: + ```python + metadata = ds.metadata + model_name = metadata["model_name"] + model_params = metadata["hyperparameters"] + ``` + + Reading preprocessing stats: + ```python + mean = ds["images"].metadata["mean"] + std = ds["images"].metadata["std"] + ``` """ def __getitem__(self, key: str) -> typing.Any: """ - Get the value for the given key + Gets metadata value for the given key. + + Args: + key: Metadata key to retrieve + + Returns: + The stored metadata value + + + + Examples: + ```python + mean = ds["images"].metadata["mean"] + std = ds["images"].metadata["std"] + ``` """ ... def keys(self) -> list[str]: """ - Return a list of all keys in the metadata + Lists all available metadata keys. + + Returns: + list[str]: List of metadata key names + + + + Examples: + ```python + # Print all metadata + for key in metadata.keys(): + print(f"{key}: {metadata[key]}") + ``` """ ... class Metadata(ReadOnlyMetadata): """ - Metadata is a key-value store. + Writable access to dataset and column metadata for ML workflows. + + Stores important information about datasets like: + - Model parameters and hyperparameters + - Preprocessing statistics + - Data splits and fold definitions + - Version and training information + + Changes are persisted immediately without requiring `commit()`. + + Examples: + Storing model metadata: + + dataset.metadata["model_name"] = "resnet50" + dataset.metadata["hyperparameters"] = { + "learning_rate": 0.001, + "batch_size": 32 + } + + Setting preprocessing stats: + + dataset.images.metadata["mean"] = [0.485, 0.456, 0.406] + dataset.images.metadata["std"] = [0.229, 0.224, 0.225] """ def __setitem__(self, key: str, value: typing.Any) -> None: """ - Set the value for the given key. Setting the value will immediately persist the change without requiring a commit(). + Sets metadata value for given key. Changes are persisted immediately. + + Args: + key: Metadata key to set + value: Value to store + + Examples: + ```python + ds.metadata["train_split"] = 0.8 + ds.metadata["val_split"] = 0.1 + ds.metadata["test_split"] = 0.1 + ``` """ ... def query(query: str, token: str | None = None) -> DatasetView: """ - Executes a TQL (Tensor Query Language) query and returns a filtered DatasetView. + Executes TQL queries optimized for ML data filtering and search. - TQL provides SQL-like querying capabilities specifically designed for ML datasets, allowing you - to filter, sort, and select data based on various criteria including vector similarity. + TQL is a SQL-like query language designed for ML datasets, supporting: + - Vector similarity search + - Text semantic search + - Complex data filtering + - Joining across datasets + - Efficient sorting and pagination Args: - query: A TQL query string. The query can: - - Filter rows using WHERE clauses - - Sort results using ORDER BY - - Select specific columns using SELECT - - Perform vector similarity search using BM25_SIMILARITY - - Join multiple datasets - token: Optional Activeloop token for authentication. Not required if using environment - credentials. 
+ query: TQL query string supporting: + - Vector similarity: COSINE_SIMILARITY, EUCLIDEAN_DISTANCE + - Text search: BM25_SIMILARITY, CONTAINS + - Filtering: WHERE clauses + - Sorting: ORDER BY + - Joins: JOIN across datasets + token: Optional Activeloop authentication token Returns: - DatasetView: A view containing the query results. The view can be: - - Used directly for ML training + DatasetView: Query results that can be: + - Used directly in ML training - Further filtered with additional queries - Converted to PyTorch/TensorFlow dataloaders - Materialized into a new dataset - Examples: - Basic filtering: - ```python - # Select images with high confidence labels - view = deeplake.query(f'SELECT * FROM "{ds_path}" WHERE confidence > 0.9') - - # Get samples from specific classes - cats = deeplake.query(f'SELECT * FROM "{ds_path}" WHERE label IN (\'cat\', \'kitten\')') - ``` + - Text similarity search: + Examples: + Vector similarity search: ```python - # Find semantically similar text using BM25 - similar = deeplake.query(f''' - SELECT * FROM "{ds_path}" - ORDER BY BM25_SIMILARITY(text_column, 'query text') DESC + # Find similar embeddings + similar = deeplake.query(''' + SELECT * FROM "mem://embeddings" + ORDER BY COSINE_SIMILARITY(vector, ARRAY[0.1, 0.2, 0.3]) DESC LIMIT 100 ''') + + # Use results in training + dataloader = similar.pytorch() ``` - Vector similarity search: + Text semantic search: ```python - # Find nearest neighbor embeddings - neighbors = deeplake.query(f''' - SELECT * FROM "{ds_path}" - ORDER BY COSINE_SIMILARITY(embedding, ARRAY[0.1, 0.2, ...]) DESC + # Search documents using BM25 + relevant = deeplake.query(''' + SELECT * FROM "mem://documents" + ORDER BY BM25_SIMILARITY(text, 'machine learning') DESC LIMIT 10 ''') ``` - Joins across datasets: + Complex filtering: ```python - # Join images with their metadata - results = deeplake.query(f''' - SELECT i.image, m.label, m.bbox - FROM "{image_ds_path}" AS i - JOIN "{metadata_ds_path}" AS m ON i.id = m.image_id - WHERE m.verified = true + # Filter training data + train = deeplake.query(''' + SELECT * FROM "mem://dataset" + WHERE "split" = 'train' + AND confidence > 0.9 + AND label IN ('cat', 'dog') ''') ``` - Using with ML frameworks: + Joins for feature engineering: ```python - # Filter dataset and create PyTorch dataloader - train_data = deeplake.query("SELECT * FROM dataset WHERE split = 'train'") - train_loader = train_data.pytorch().dataloader(batch_size=32) + # Combine image features with metadata + features = deeplake.query(''' + SELECT i.image, i.embedding, m.labels, m.metadata + FROM "mem://images" AS i + JOIN "mem://metadata" AS m ON i.id = m.image_id + WHERE m.verified = true + ''') ``` """ ... def query_async(query: str, token: str | None = None) -> Future: """ - Asynchronously executes a TQL (Tensor Query Language) query and returns a Future that will resolve into DatasetView. + Asynchronously executes TQL queries optimized for ML data filtering and search. - TQL provides SQL-like querying capabilities specifically designed for ML datasets, allowing you - to filter, sort, and select data based on various criteria including vector similarity. + Non-blocking version of `query()` for better performance with large datasets. + Supports the same TQL features including vector similarity search, text search, + filtering, and joins. Args: - query: A TQL query string. 
The query can: - - Filter rows using WHERE clauses - - Sort results using ORDER BY - - Select specific columns using SELECT - - Perform vector similarity search using BM25_SIMILARITY - - Join multiple datasets - token: Optional Activeloop token for authentication. Not required if using environment - credentials. + query: TQL query string supporting: + - Vector similarity: COSINE_SIMILARITY, EUCLIDEAN_DISTANCE + - Text search: BM25_SIMILARITY, CONTAINS + - Filtering: WHERE clauses + - Sorting: ORDER BY + - Joins: JOIN across datasets + token: Optional Activeloop authentication token Returns: - Future: A Future object that resolves to a DatasetView. The resulting view can be: - - Used directly for ML training + Future: Resolves to DatasetView that can be: + - Used directly in ML training - Further filtered with additional queries - Converted to PyTorch/TensorFlow dataloaders - Materialized into a new dataset - Examples: - Basic filtering with await: - ```python - # Select images with high confidence labels - view = await deeplake.query_async(f'SELECT * FROM "{ds_path}" WHERE confidence > 0.9') - - # Get samples from specific classes - cats = await deeplake.query_async(f'SELECT * FROM "{ds_path}" WHERE label IN (\'cat\', \'kitten\')') - ``` + - Text similarity search with Future.result(): + Examples: + Basic async query: ```python - # Find semantically similar text using BM25 - future = deeplake.query_async(f''' - SELECT * FROM "{ds_path}" - ORDER BY BM25_SIMILARITY(text_column, 'query text') DESC - LIMIT 100 + # Run query asynchronously + future = deeplake.query_async(''' + SELECT * FROM "mem://embeddings" + ORDER BY COSINE_SIMILARITY(vector, ARRAY[0.1, 0.2, 0.3]) DESC ''') - similar = future.result() # Blocks until query completes - ``` - Vector similarity search: - ```python - # Find nearest neighbor embeddings - neighbors = await deeplake.query_async(f''' - SELECT * FROM "{ds_path}" - ORDER BY COSINE_SIMILARITY(embedding, ARRAY[0.1, 0.2, ...]) DESC - LIMIT 10 - ''') - ``` + # Do other work while query runs + prepare_training() - Joins across datasets: - ```python - # Join images with their metadata - results = await deeplake.query_async(f''' - SELECT i.image, m.label, m.bbox - FROM "{image_ds_path}" AS i - JOIN "{metadata_ds_path}" AS m ON i.id = m.image_id - WHERE m.verified = true - ''') + # Get results when needed + results = future.result() ``` - Using with ML frameworks: + With async/await: ```python - # Filter dataset and create PyTorch dataloader - future = deeplake.query_async(f'SELECT * FROM "{ds_path}" WHERE split = \'train\'') - train_data = future.result() - train_loader = train_data.pytorch().dataloader(batch_size=32) + async def search_similar(): + results = await deeplake.query_async(''' + SELECT * FROM "mem://images" + ORDER BY COSINE_SIMILARITY(embedding, ARRAY[0.1, 0.2, 0.3]) DESC + LIMIT 100 + ''') + return results + + async def main(): + similar = await search_similar() ``` Non-blocking check: ```python - # Check if query is complete without blocking - future = deeplake.query_async(f'SELECT * FROM "{ds_path}"') + future = deeplake.query_async( + "SELECT * FROM dataset WHERE \\"split\\" = 'train'" + ) + if future.is_completed(): - results = future.result() + train_data = future.result() + else: + print("Query still running...") ``` """ ... class Client: + """ + Client for connecting to Activeloop services. + Handles authentication and API communication. 
+ """ endpoint: str class Tag: @@ -635,36 +903,46 @@ class ColumnView: - Access column metadata and properties - Get information about linked data if the column contains references + + Examples: - Load image data from a column for training + Load image data from a column for training: ```python # Access a single image - image = dataset["images"][0] + image = ds["images"][0] # Load a batch of images - batch = dataset["images"][0:32] + batch = ds["images"][0:32] # Async load for better performance - images_future = dataset["images"].get_async(0:32) + images_future = ds["images"].get_async(slice(0, 32)) images = images_future.result() ``` - Access embeddings for similarity search + Access embeddings for similarity search: ```python # Get all embeddings - embeddings = dataset["embeddings"][:] + embeddings = ds["embeddings"][:] # Get specific embeddings by indices - selected = dataset["embeddings"][[1, 5, 10]] + selected = ds["embeddings"][[1, 5, 10]] ``` - Check column properties + Check column properties: ```python # Get column name - name = dataset["images"].name + name = ds["images"].name # Access metadata - if "mean" in dataset["images"].metadata: + if "mean" in ds["images"].metadata.keys(): mean = dataset["images"].metadata["mean"] ``` """ @@ -682,6 +960,16 @@ class ColumnView: Returns: The data at the specified index/indices. Type depends on the column's data type. + + Examples: ```python # Get single item @@ -710,14 +998,26 @@ class ColumnView: Returns: Future: A Future object that resolves to the requested data. + + Examples: ```python # Async batch load - future = column.get_async(0:32) + future = column.get_async(slice(0, 32)) batch = future.result() # Using with async/await - batch = await column.get_async(0:32) + async def load_batch(): + batch = await column.get_async(slice(0, 32)) + return batch ``` """ ... @@ -750,6 +1050,20 @@ class ColumnView: Access the column's metadata. Useful for storing statistics, preprocessing parameters, or other information about the column data. + Returns: + ReadOnlyMetadata: A ReadOnlyMetadata object for reading metadata. + + + Examples: ```python # Access preprocessing parameters @@ -786,32 +1100,49 @@ class Column(ColumnView): - Access and modify column metadata - Handle various data types common in ML: images, embeddings, labels, etc. + + Examples: - Update training labels + Update training labels: ```python # Update single label - dataset["labels"][0] = 1 + ds["labels"][0] = 1 # Update batch of labels - dataset["labels"][0:32] = new_labels + ds["labels"][0:32] = new_labels # Async update for better performance - future = dataset["labels"].set_async(0:32, new_labels) + future = ds["labels"].set_async(slice(0, 32), new_labels) future.wait() ``` - Store image embeddings + Store image embeddings: ```python # Generate and store embeddings embeddings = model.encode(images) - dataset["embeddings"][0:len(embeddings)] = embeddings + ds["embeddings"][0:len(embeddings)] = embeddings ``` - Manage column metadata + Manage column metadata: ```python # Store preprocessing parameters - dataset["images"].metadata["mean"] = [0.485, 0.456, 0.406] - dataset["images"].metadata["std"] = [0.229, 0.224, 0.225] + ds["images"].metadata["mean"] = [0.485, 0.456, 0.406] + ds["images"].metadata["std"] = [0.229, 0.224, 0.225] ``` """ @@ -825,6 +1156,18 @@ class Column(ColumnView): - slice: Range of indices (e.g., 0:10) value: The data to store. Must match the column's data type. 
+ + Examples: ```python # Update single item @@ -850,14 +1193,27 @@ class Column(ColumnView): Returns: FutureVoid: A FutureVoid that completes when the update is finished. + + Examples: ```python # Async batch update - future = column.set_async(0:32, new_batch) + future = column.set_async(slice(0, 32), new_batch) future.wait() # Using with async/await - await column.set_async(0:32, new_batch) + async def update_batch(): + await column.set_async(slice(0, 32), new_batch) ``` """ ... @@ -926,6 +1282,16 @@ class Row: Returns: Future: A Future object that will resolve to the value containing the column data. + + Examples: ```python future = row.get_async("column_name") @@ -955,6 +1321,17 @@ class Row: Returns: FutureVoid: A FutureVoid object that will resolve when the operation is complete. + + Examples: ```python future_void = row.set_async("column_name", new_value) @@ -1004,6 +1381,16 @@ class RowRange: Returns: Future: A Future object that will resolve to the value containing the column data. + + Examples: ```python future = row_range.get_async("column_name") @@ -1033,6 +1420,17 @@ class RowRange: Returns: FutureVoid: A FutureVoid object that will resolve when the operation is complete. + + Examples: ```python future_void = row_range.set_async("column_name", new_value) @@ -1085,6 +1483,16 @@ class RowRangeView: Returns: Future: A Future object that will resolve to the value containing the column data. + + Examples: ```python future = row_range_view.get_async("column_name") @@ -1118,6 +1526,16 @@ class RowView: Returns: Future: A Future object that will resolve to the value containing the column data. + + Examples: ```python future = row_view.get_async("column_name") @@ -1221,7 +1639,6 @@ class DatasetView: # process row pass ``` - """ ... @@ -1239,15 +1656,6 @@ class DatasetView: ```python ds.summary() ``` - - Example Output: - ``` - Dataset length: 5 - Columns: - id : int64 - title : text - embedding: embedding(768) - ``` """ ... @@ -1255,13 +1663,20 @@ class DatasetView: """ Executes the given TQL query against the dataset and return the results as a [deeplake.DatasetView][]. + + Examples: ```python result = ds.query("select * where category == 'active'") for row in result: print("Id is: ", row["id"]) ``` - """ ... @@ -1269,6 +1684,14 @@ class DatasetView: """ Asynchronously executes the given TQL query against the dataset and return a future that will resolve into [deeplake.DatasetView][]. + + Examples: ```python future = ds.query_async("select * where category == 'active'") @@ -1276,11 +1699,12 @@ class DatasetView: for row in result: print("Id is: ", row["id"]) - # or use the Future in an await expression - future = ds.query_async("select * where category == 'active'") - result = await future - for row in result: - print("Id is: ", row["id"]) + async def query_and_process(): + # or use the Future in an await expression + future = ds.query_async("select * where category == 'active'") + result = await future + for row in result: + print("Id is: ", row["id"]) ``` """ ... @@ -1304,14 +1728,23 @@ class DatasetView: Raises: ImportError: If TensorFlow is not installed + + Examples: ```python - ds = deeplake.open("path/to/dataset") - dl = ds.tensorflow().shuffle(500).batch(32). - for i_batch, sample_batched in enumerate(dataloader): + dl = ds.tensorflow().shuffle(500).batch(32) + for i_batch, sample_batched in enumerate(dl): process_batch(sample_batched) ``` - """ ... 
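Taken together, the async getters above support a simple double-buffered read pattern: request the next slice with `get_async` before consuming the current one, so storage I/O overlaps with compute. A minimal sketch, not part of this diff: the dataset path and the `process` callback are hypothetical, and it assumes `len(ds)` plus the documented `get_async(slice)` / `Future.result()` calls:

```python
import deeplake

def iterate_prefetched(ds, process, batch_size=32):
    # Double-buffered loop: overlap loading of batch N+1 with work on batch N.
    n = len(ds)
    future = ds["images"].get_async(slice(0, min(batch_size, n)))
    for start in range(batch_size, n, batch_size):
        nxt = ds["images"].get_async(slice(start, min(start + batch_size, n)))
        process(future.result())  # blocks only if the read is still in flight
        future = nxt
    process(future.result())  # drain the last in-flight batch

ds = deeplake.open("file:///tmp/example_ds")  # hypothetical local dataset
iterate_prefetched(ds, process=lambda batch: None)
```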
@@ -1327,17 +1760,27 @@ class DatasetView: Raises: ImportError: If pytorch is not installed + + Examples: ```python from torch.utils.data import DataLoader - ds = deeplake.open("path/to/dataset") - dataloader = DataLoader(ds.pytorch(), batch_size=60, - shuffle=True, num_workers=10) - for i_batch, sample_batched in enumerate(dataloader): - process_batch(sample_batched) + dl = DataLoader(ds.pytorch(), batch_size=60, + shuffle=True, num_workers=8) + for i_batch, sample_batched in enumerate(dl): + process_batch(sample_batched) ``` - """ ... @@ -1349,8 +1792,8 @@ class DatasetView: batch_size: Number of rows in each batch drop_last: Whether to drop the final batch if it is incomplete - Examples: - ```python + Examples: + ```python ds = deeplake.open("al://my_org/dataset") batches = ds.batches(batch_size=2000, drop_last=True) for batch in batches: @@ -1479,6 +1922,15 @@ class Dataset(DatasetView): - `tuple`: A tuple of indices specifying the rows to return. Returns a [deeplake.RowRange][] - `str`: A string specifying column to return all values from. Returns a [deeplake.Column][] + + Examples: ```python row = ds[318] @@ -1493,7 +1945,6 @@ class Dataset(DatasetView): column_data = ds["id"] ``` - """ ... @@ -1507,7 +1958,6 @@ class Dataset(DatasetView): # process row pass ``` - """ ... @@ -1550,7 +2000,7 @@ class Dataset(DatasetView): ```python ds.add_column("labels", deeplake.types.Int32) - ds.add_column("labels", "int32") + ds.add_column("categories", "int32") ds.add_column("name", deeplake.types.Text()) @@ -1558,7 +2008,7 @@ class Dataset(DatasetView): ds.add_column("images", deeplake.types.Image(dtype=deeplake.types.UInt8(), sample_compression="jpeg")) - ds.add_column("embedding", deeplake.types.Embedding(dtype=deeplake.types.Float32(), dimensions=768)) + ds.add_column("embedding", deeplake.types.Embedding(size=768)) ``` Raises: @@ -1572,6 +2022,14 @@ class Dataset(DatasetView): Args: name: The name of the column to remove + + Examples: ```python ds.remove_column("name") @@ -1589,6 +2047,14 @@ class Dataset(DatasetView): name: The name of the column to rename new_name: The new name to set to column + + Examples: ```python ds.rename_column("old_name", "new_name") @@ -1620,21 +2086,37 @@ class Dataset(DatasetView): Args: data: The data to insert into the dataset. 
+ + Examples: ```python ds.append({"name": ["Alice", "Bob"], "age": [25, 30]}) ds.append([{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}]) + ``` - ds.append({ + ```python + ds2.append({ "embedding": np.random.rand(4, 768), "text": ["Hello World"] * 4}) - ds.append([{"embedding": np.random.rand(768), "text": "Hello World"}] * 4) + ds2.append([{"embedding": np.random.rand(768), "text": "Hello World"}] * 4) ``` ```python - ds.append(deeplake.from_parquet("./file.parquet")) + ds2.append(deeplake.from_parquet("./file.parquet")) ``` Raises: @@ -1662,12 +2144,9 @@ class Dataset(DatasetView): Examples: ```python ds.commit() - ``` - ```python ds.commit("Added data from updated documents") ``` - """ def commit_async(self, message: str | None = None) -> FutureVoid: @@ -1682,21 +2161,12 @@ class Dataset(DatasetView): Examples: ```python ds.commit_async().wait() - ``` - ```python ds.commit_async("Added data from updated documents").wait() - ``` - ```python - await ds.commit_async() - ``` - - ```python - await ds.commit_async("Added data from updated documents") - ``` + async def do_commit(): + await ds.commit_async() - ```python future = ds.commit_async() # then you can check if the future is completed using future.is_completed() ``` """ @@ -1807,7 +2277,6 @@ class ReadOnlyDataset(DatasetView): # process row pass ``` - """ ... @@ -1953,6 +2422,12 @@ class InvalidPolygonShapeError(Exception): class InvalidLinkDataError(Exception): pass +class InvalidCredsKeyAssignmentError(Exception): + pass + +class CredsKeyAlreadyAssignedError(Exception): + pass + class GcsStorageProviderFailed(Exception): pass @@ -2055,12 +2530,6 @@ class UnsupportedChunkCompression(Exception): class InvalidImageCompression(Exception): pass -class InvalidCredsKeyAssignmentError(Exception): - pass - -class CredsKeyAlreadyAssignedError(Exception): - pass - class InvalidSegmentMaskCompression(Exception): pass @@ -2206,11 +2675,21 @@ def create( token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated. schema (dict): The initial schema to use for the dataset. See `deeplake.schema` such as [deeplake.schemas.TextEmbeddings][] for common starting schemas. + + Examples: ```python - import deeplake - from deeplake import types - # Create a dataset in your local filesystem: ds = deeplake.create("directory_path") ds.add_column("id", types.Int32()) @@ -2219,42 +2698,23 @@ def create( ds.commit() ds.summary() ``` - Output: - ``` - Dataset length: 0 - Columns: - id : int32 - url : text - embedding: embedding(768) - ``` ```python # Create dataset in your app.activeloop.ai organization: ds = deeplake.create("al://organization_id/dataset_name") - ``` - ```python # Create a dataset stored in your cloud using specified credentials: ds = deeplake.create("s3://mybucket/my_dataset", - creds = {"aws_access_key_id": ..., ...}) - ``` + creds = {"aws_access_key_id": id, "aws_secret_access_key": key}) - ```python # Create dataset stored in your cloud using app.activeloop.ai managed credentials. ds = deeplake.create("s3://mybucket/my_dataset", creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id") - ``` - ```python - # Create dataset stored in your cloud using app.activeloop.ai managed credentials. 
ds = deeplake.create("azure://bucket/path/to/dataset") - ``` - ```python ds = deeplake.create("gcs://bucket/path/to/dataset") - ``` - ```python ds = deeplake.create("mem://in-memory") ``` @@ -2275,54 +2735,50 @@ def create_async( To open an existing dataset, use [deeplake.open_async][]. + + Examples: ```python - import deeplake - from deeplake import types - - # Asynchronously create a dataset in your local filesystem: - ds = await deeplake.create_async("directory_path") - await ds.add_column("id", types.Int32()) - await ds.add_column("url", types.Text()) - await ds.add_column("embedding", types.Embedding(768)) - await ds.commit() - await ds.summary() # Example of usage in an async context - ``` + async def create_dataset(): + # Asynchronously create a dataset in your local filesystem: + ds = await deeplake.create_async("directory_path") + await ds.add_column("id", types.Int32()) + await ds.add_column("url", types.Text()) + await ds.add_column("embedding", types.Embedding(768)) + await ds.commit() + await ds.summary() # Example of usage in an async context - ```python - # Alternatively, create a dataset using .result(). - future_ds = deeplake.create_async("directory_path") - ds = future_ds.result() # Blocks until the dataset is created - ``` + # Alternatively, create a dataset using .result(). + future_ds = deeplake.create_async("directory_path") + ds = future_ds.result() # Blocks until the dataset is created - ```python - # Create a dataset in your app.activeloop.ai organization: - ds = await deeplake.create_async("al://organization_id/dataset_name") - ``` + # Create a dataset in your app.activeloop.ai organization: + ds = await deeplake.create_async("al://organization_id/dataset_name") - ```python - # Create a dataset stored in your cloud using specified credentials: - ds = await deeplake.create_async("s3://mybucket/my_dataset", - creds={"aws_access_key_id": ..., ...}) - ``` + # Create a dataset stored in your cloud using specified credentials: + ds = await deeplake.create_async("s3://mybucket/my_dataset", + creds={"aws_access_key_id": id, "aws_secret_access_key": key}) - ```python - # Create dataset stored in your cloud using app.activeloop.ai managed credentials. - ds = await deeplake.create_async("s3://mybucket/my_dataset", - creds={"creds_key": "managed_creds_key"}, org_id="my_org_id") - ``` + # Create dataset stored in your cloud using app.activeloop.ai managed credentials. + ds = await deeplake.create_async("s3://mybucket/my_dataset", + creds={"creds_key": "managed_creds_key"}, org_id="my_org_id") - ```python - # Create dataset stored in your cloud using app.activeloop.ai managed credentials. - ds = await deeplake.create_async("azure://bucket/path/to/dataset") - ``` + ds = await deeplake.create_async("azure://bucket/path/to/dataset") - ```python - ds = await deeplake.create_async("gcs://bucket/path/to/dataset") - ``` + ds = await deeplake.create_async("gcs://bucket/path/to/dataset") - ```python - ds = await deeplake.create_async("mem://in-memory") + ds = await deeplake.create_async("mem://in-memory") ``` Raises: @@ -2348,11 +2804,18 @@ def copy( dst_creds (dict, str, optional): The string ``ENV`` or a dictionary containing credentials used to access the destination dataset at the path. token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated. 
+ + Examples: ```python deeplake.copy("al://organization_id/source_dataset", "al://organization_id/destination_dataset") ``` - """ def delete( @@ -2388,7 +2851,7 @@ def open( See [deeplake.open_read_only][] for opening the dataset in read only mode - To create a new dataset, see [deeplake.open][] + To create a new dataset, see [deeplake.create][] Args: url: The URL of the dataset. URLs can be specified using the following protocols: @@ -2410,33 +2873,32 @@ def open( - If nothing is given is, credentials are fetched from the environment variables. This is also the case when creds is not passed for cloud datasets token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated. + + Examples: ```python # Load dataset managed by Deep Lake. ds = deeplake.open("al://organization_id/dataset_name") - ``` - ```python # Load dataset stored in your cloud using your own credentials. ds = deeplake.open("s3://bucket/my_dataset", - creds = {"aws_access_key_id": ..., ...}) - ``` + creds = {"aws_access_key_id": id, "aws_secret_access_key": key}) - ```python # Load dataset stored in your cloud using Deep Lake managed credentials. ds = deeplake.open("s3://bucket/my_dataset", - ...creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id") - ``` + creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id") - ```python ds = deeplake.open("s3://bucket/path/to/dataset") - ``` - ```python ds = deeplake.open("azure://bucket/path/to/dataset") - ``` - ```python ds = deeplake.open("gcs://bucket/path/to/dataset") ``` """ @@ -2451,38 +2913,27 @@ def open_async( Examples: ```python - # Asynchronously load dataset managed by Deep Lake using await. - ds = await deeplake.open_async("al://organization_id/dataset_name") - ``` + async def async_open(): + # Asynchronously load dataset managed by Deep Lake using await. + ds = await deeplake.open_async("al://organization_id/dataset_name") - ```python - # Asynchronously load dataset stored in your cloud using your own credentials. - ds = await deeplake.open_async("s3://bucket/my_dataset", - creds={"aws_access_key_id": ..., ...}) - ``` + # Asynchronously load dataset stored in your cloud using your own credentials. + ds = await deeplake.open_async("s3://bucket/my_dataset", + creds={"aws_access_key_id": id, "aws_secret_access_key": key}) - ```python - # Asynchronously load dataset stored in your cloud using Deep Lake managed credentials. - ds = await deeplake.open_async("s3://bucket/my_dataset", - creds={"creds_key": "managed_creds_key"}, org_id="my_org_id") - ``` + # Asynchronously load dataset stored in your cloud using Deep Lake managed credentials. + ds = await deeplake.open_async("s3://bucket/my_dataset", + creds={"creds_key": "managed_creds_key"}, org_id="my_org_id") - ```python - ds = await deeplake.open_async("s3://bucket/path/to/dataset") - ``` + ds = await deeplake.open_async("s3://bucket/path/to/dataset") - ```python - ds = await deeplake.open_async("azure://bucket/path/to/dataset") - ``` + ds = await deeplake.open_async("azure://bucket/path/to/dataset") - ```python - ds = await deeplake.open_async("gcs://bucket/path/to/dataset") - ``` + ds = await deeplake.open_async("gcs://bucket/path/to/dataset") - ```python - # Alternatively, load the dataset using .result(). - future_ds = deeplake.open_async("al://organization_id/dataset_name") - ds = future_ds.result() # Blocks until the dataset is loaded + # Alternatively, load the dataset using .result(). 
+ future_ds = deeplake.open_async("al://organization_id/dataset_name") + ds = future_ds.result() # Blocks until the dataset is loaded ``` """ @@ -2509,12 +2960,18 @@ def like( - If nothing is given is, credentials are fetched from the environment variables. This is also the case when creds is not passed for cloud datasets token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated. + + Examples: ```python ds = deeplake.like(src="az://bucket/existing/to/dataset", dest="s3://bucket/new/dataset") ``` - """ def connect( @@ -2538,33 +2995,30 @@ def connect( creds_key (str, optional): The creds_key of the managed credentials that will be used to access the source path. If not set, use the organization's default credentials. token (str, optional): Activeloop token used to fetch the managed credentials. + + Examples: ```python ds = deeplake.connect("s3://bucket/path/to/dataset", "al://my_org/dataset") - ``` - ```python ds = deeplake.connect("s3://bucket/path/to/dataset", "al://my_org/dataset", creds_key="my_key") - ``` - ```python # Connect the dataset as al://my_org/dataset ds = deeplake.connect("s3://bucket/path/to/dataset", org_id="my_org") - ``` - ```python ds = deeplake.connect("az://bucket/path/to/dataset", "al://my_org/dataset", creds_key="my_key") - ``` - ```python ds = deeplake.connect("gcs://bucket/path/to/dataset", "al://my_org/dataset", creds_key="my_key") - ``` - """ def disconnect(url: str, token: str | None = None) -> None: @@ -2584,7 +3038,6 @@ def disconnect(url: str, token: str | None = None) -> None: ```python deeplake.disconnect("al://my_org/dataset_name") ``` - """ def open_read_only( @@ -2618,39 +3071,26 @@ def open_read_only( token (str, optional): Activeloop token to authenticate user. Examples: - ```python + ds = deeplake.open_read_only("directory_path") ds.summary() - ``` Example Output: - ``` Dataset length: 5 Columns: id : int32 url : text embedding: embedding(768) - ``` - ```python ds = deeplake.open_read_only("file:///path/to/dataset") - ``` - ```python ds = deeplake.open_read_only("s3://bucket/path/to/dataset") - ``` - ```python ds = deeplake.open_read_only("azure://bucket/path/to/dataset") - ``` - ```python ds = deeplake.open_read_only("gcs://bucket/path/to/dataset") - ``` - ```python ds = deeplake.open_read_only("mem://in-memory") - ``` """ def open_read_only_async( @@ -2662,36 +3102,69 @@ def open_read_only_async( See [deeplake.open_async][] for opening datasets for modification and [deeplake.open_read_only][] for sync open. Examples: - ```python + # Asynchronously open a dataset in read-only mode: ds = await deeplake.open_read_only_async("directory_path") - ``` - ```python # Alternatively, open the dataset using .result(). 
future_ds = deeplake.open_read_only_async("directory_path") ds = future_ds.result() # Blocks until the dataset is loaded - ``` - ```python ds = await deeplake.open_read_only_async("file:///path/to/dataset") - ``` - ```python ds = await deeplake.open_read_only_async("s3://bucket/path/to/dataset") - ``` - ```python ds = await deeplake.open_read_only_async("azure://bucket/path/to/dataset") - ``` - ```python ds = await deeplake.open_read_only_async("gcs://bucket/path/to/dataset") - ``` - ```python ds = await deeplake.open_read_only_async("mem://in-memory") + """ + +def convert( + src: str, + dst: str, + dst_creds: dict[str, str] | None = None, + token: str | None = None +) -> None: + """ + Converts a Deep Lake v3 dataset to the new v4 format while preserving data and metadata. + Optimized for ML workloads with efficient handling of large datasets and linked data. + + Args: + src: URL of the source v3 dataset to convert + dst: Destination URL for the new v4 dataset. Supports: + - `file://path` local storage + - `s3://bucket/path` S3 storage + - `gs://bucket/path` Google Cloud storage + - `azure://bucket/path` Azure storage + dst_creds: Optional credentials for accessing the destination storage. + Supports cloud provider credentials like access keys + token: Optional Activeloop authentication token + + <-- test-context + ```python + import deeplake + deeplake.convert = lambda src, dst, dst_creds = None, token = None: None + ``` + --> + + Examples: + ```python + # Convert local dataset + deeplake.convert("old_dataset/", "new_dataset/") + + # Convert cloud dataset with credentials + deeplake.convert( + "s3://old-bucket/dataset", + "s3://new-bucket/dataset", + dst_creds={"aws_access_key_id": "key", + "aws_secret_access_key": "secret"} + ) ``` + + Notes: + - You can open v3 dataset without converting it to v4 using `deeplake.query('SELECT * FROM "old_dataset/"')` """ def from_parquet(url: str) -> ReadOnlyDataset: @@ -2702,6 +3175,4 @@ def from_parquet(url: str) -> ReadOnlyDataset: url: The URL of the Parquet dataset. If no protocol is specified, it assumes `file://` """ -def __child_atfork() -> None: ... -def __parent_atfork() -> None: ... def __prepare_atfork() -> None: ... diff --git a/python/deeplake/ingestion/__init__.py b/python/deeplake/ingestion/__init__.py new file mode 100644 index 0000000000..698af773c9 --- /dev/null +++ b/python/deeplake/ingestion/__init__.py @@ -0,0 +1 @@ +from deeplake.ingestion.coco.ingest_coco import ingest_coco \ No newline at end of file diff --git a/python/deeplake/ingestion/coco/__init__.py b/python/deeplake/ingestion/coco/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/deeplake/ingestion/coco/exceptions.py b/python/deeplake/ingestion/coco/exceptions.py new file mode 100644 index 0000000000..4703dc9a4f --- /dev/null +++ b/python/deeplake/ingestion/coco/exceptions.py @@ -0,0 +1,11 @@ +class CocoAnnotationMissingError(Exception): + def __init__(self, keys): + super().__init__( + ( + "COCO dataset ingestion expects to have `instances`, `keypoints` and `stuff`. " + "{} {} missing." 
).format(
+                f"Key {keys[0]}" if len(keys) == 1 else f"Keys {', '.join(keys)}",
+                "is" if len(keys) == 1 else "are",
+            )
+        )
diff --git a/python/deeplake/ingestion/coco/ingest_coco.py b/python/deeplake/ingestion/coco/ingest_coco.py
new file mode 100644
index 0000000000..f3fe86db84
--- /dev/null
+++ b/python/deeplake/ingestion/coco/ingest_coco.py
@@ -0,0 +1,366 @@
+from typing import Union, Optional, List, Dict
+import pathlib
+from deeplake.ingestion.coco.exceptions import CocoAnnotationMissingError
+import deeplake as dp
+import numpy as np
+from tqdm import tqdm
+import os
+
+COCO_REQUIRED_KEYS = ["instances", "keypoints", "stuff"]
+MASKS_NOTE = "All segmentation polygons and RLEs were converted to stacked binary masks"
+
+
+def convert_pathlib_to_string_if_needed(path: Union[str, pathlib.Path]) -> str:
+    if isinstance(path, pathlib.Path):
+        path = str(path)
+    return path
+
+
+def verify_coco_annotation_dict(
+    annotation_files: Dict[str, Union[str, pathlib.Path]] = {}
+):
+    if all(key in annotation_files for key in COCO_REQUIRED_KEYS):
+        return {
+            key: convert_pathlib_to_string_if_needed(value)
+            for key, value in annotation_files.items()
+        }
+    else:
+        raise CocoAnnotationMissingError(
+            list(set(COCO_REQUIRED_KEYS) - annotation_files.keys())
+        )
+
+
+class COCOStructuredDataset:
+    def __init__(
+        self,
+        dataset: dp.Dataset = None,
+        images_directory: Union[str, pathlib.Path] = None,
+        annotation_files: Dict[str, Union[str, pathlib.Path]] = {},
+    ):
+        from pycocotools.coco import COCO
+
+        self.dataset = dataset
+        self.images_directory = images_directory
+        self.annotation_files = annotation_files
+
+        self.coco = COCO(self.annotation_files["instances"])
+        self.coco_kp = COCO(self.annotation_files["keypoints"])
+        self.coco_stuff = COCO(self.annotation_files["stuff"])
+
+        self.category_info = self.coco.loadCats(self.coco.getCatIds())
+        self.category_info_kp = self.coco_kp.loadCats(self.coco_kp.getCatIds())
+        self.category_info_stuff = self.coco_stuff.loadCats(self.coco_stuff.getCatIds())
+        self.img_ids = sorted(self.coco.getImgIds())  # Image ids for uploading
+
+        self.cat_names = [category["name"] for category in self.category_info]
+        self.super_cat_names = list(
+            set([category["supercategory"] for category in self.category_info])
+        )
+        self.cat_names_kp = [category["name"] for category in self.category_info_kp]
+        self.super_cat_names_kp = list(
+            set([category["supercategory"] for category in self.category_info_kp])
+        )
+        self.cat_names_stuff = [
+            category["name"] for category in self.category_info_stuff
+        ]
+        self.super_cat_names_stuff = list(
+            set([category["supercategory"] for category in self.category_info_stuff])
+        )
+
+    def get_kp_group_data(self, height, width, anns_kp):
+        # Iterate through keypoint annotations and parse each
+        categories_kp = np.zeros((len(anns_kp)))
+        supercats_kp = np.zeros((len(anns_kp)))
+        masks_kp = np.zeros((height, width, len(anns_kp)))
+        boxes_kp = np.zeros((len(anns_kp), 4))
+        keypoints_kp = np.zeros((51, len(anns_kp)))
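+
+        # COCO person keypoints are flattened (x, y, visibility) triplets:
+        # 17 joints x 3 values = 51 entries per annotation, hence the fixed
+        # first dimension above.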
+        for j, ann_kp in enumerate(anns_kp):
+            categories_kp[j] = self.cat_names_kp.index(
+                [
+                    self.category_info_kp[i]["name"]
+                    for i in range(len(self.category_info_kp))
+                    if self.category_info_kp[i]["id"] == ann_kp["category_id"]
+                ][0]
+            )
+            supercats_kp[j] = self.super_cat_names_kp.index(
+                [
+                    self.category_info_kp[i]["supercategory"]
+                    for i in range(len(self.category_info_kp))
+                    if self.category_info_kp[i]["id"] == ann_kp["category_id"]
+                ][0]
+            )
+            mask_kp = self.coco.annToMask(ann_kp)  # Convert annotation to mask
+            masks_kp[:, :, j] = mask_kp
+            boxes_kp[j, :] = ann_kp["bbox"]
+            keypoints_kp[:, j] = np.array(ann_kp["keypoints"])
+
+        return categories_kp, supercats_kp, masks_kp, boxes_kp, keypoints_kp
+
+    def get_stuff_group_data(self, height, width, anns_stuff):
+        # Iterate through stuff annotations and parse each
+        masks_stuff = np.zeros((height, width, len(anns_stuff)))
+        boxes_stuff = np.zeros((len(anns_stuff), 4))
+        categories_stuff = np.zeros((len(anns_stuff)))
+        areas_stuff = np.zeros((len(anns_stuff)))
+        iscrowds_stuff = np.zeros((len(anns_stuff)))
+        supercats_stuff = np.zeros((len(anns_stuff)))
+
+        for k, ann_stuff in enumerate(anns_stuff):
+            mask_stuff = self.coco.annToMask(ann_stuff)  # Convert annotation to mask
+            masks_stuff[:, :, k] = mask_stuff
+            boxes_stuff[k, :] = ann_stuff["bbox"]
+
+            # Brute-force lookup: make no assumptions about how category ids
+            # relate to the order of the category lists.
+            categories_stuff[k] = self.cat_names_stuff.index(
+                [
+                    self.category_info_stuff[i]["name"]
+                    for i in range(len(self.category_info_stuff))
+                    if self.category_info_stuff[i]["id"] == ann_stuff["category_id"]
+                ][0]
+            )
+            supercats_stuff[k] = self.super_cat_names_stuff.index(
+                [
+                    self.category_info_stuff[i]["supercategory"]
+                    for i in range(len(self.category_info_stuff))
+                    if self.category_info_stuff[i]["id"] == ann_stuff["category_id"]
+                ][0]
+            )
+
+            areas_stuff[k] = ann_stuff["area"]
+            iscrowds_stuff[k] = ann_stuff["iscrowd"]
+
+            if "segmentation" not in ann_stuff:
+                print(
+                    "----No segmentation found for image id {} "
+                    "({} stuff annotations); continuing.----".format(
+                        ann_stuff["image_id"], len(anns_stuff)
+                    )
+                )
+
+        return (
+            masks_stuff,
+            boxes_stuff,
+            categories_stuff,
+            areas_stuff,
+            iscrowds_stuff,
+            supercats_stuff,
+        )
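+
+    # Column layout: instance annotations live in the top-level columns,
+    # keypoint annotations under the pose/ group, and stuff segmentation
+    # under the stuff/ group.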
+    def create_structure(self):
+        self.dataset.add_column(
+            "images", dp.types.Image(dp.types.UInt8(), sample_compression="jpg")
+        )
+        self.dataset.add_column("masks", dp.types.BinaryMask(sample_compression="lz4"))
+        self.dataset.add_column(
+            "boxes", dp.types.BoundingBox(dp.types.Float32(), "ltrb", "pixel")
+        )
+        self.dataset.add_column(
+            "categories", dp.types.ClassLabel(dp.types.Array("uint32", 1))
+        )
+        self.dataset["categories"].metadata["class_names"] = self.cat_names
+        self.dataset.add_column(
+            "super_categories", dp.types.ClassLabel(dp.types.Array("uint32", 1))
+        )
+        self.dataset["super_categories"].metadata["class_names"] = self.super_cat_names
+        self.dataset.add_column("areas", dp.types.Array("uint32", 1))
+        self.dataset.add_column("iscrowds", dp.types.Array("bool", 1))
+        self.dataset.add_column("images_meta", dp.types.Dict())
+
+        # Pose
+        self.dataset.add_column(
+            "pose/categories", dp.types.ClassLabel(dp.types.Array("uint32", 1))
+        )
+        self.dataset["pose/categories"].metadata["class_names"] = self.cat_names_kp
+        self.dataset.add_column(
+            "pose/super_categories", dp.types.ClassLabel(dp.types.Array("uint32", 1))
+        )
+        self.dataset["pose/super_categories"].metadata[
+            "class_names"
+        ] = self.super_cat_names_kp
+        self.dataset.add_column(
+            "pose/boxes", dp.types.BoundingBox(dp.types.Float32(), "LTWH", "pixel")
+        )
+        self.dataset.add_column(
+            "pose/keypoints", dp.types.Array("int32", 2)
+        )  # htype="keypoints_coco"
+        self.dataset.add_column(
+            "pose/masks", dp.types.BinaryMask(sample_compression="lz4")
+        )
+
+        # Stuff
+        self.dataset.add_column(
+            "stuff/masks", dp.types.BinaryMask(sample_compression="lz4")
+        )
+        self.dataset.add_column(
+            "stuff/boxes", dp.types.BoundingBox(dp.types.Float32(), "LTWH", "pixel")
+        )
+        self.dataset.add_column(
+            "stuff/categories", dp.types.ClassLabel(dp.types.Array("uint32", 1))
+        )
+        self.dataset["stuff/categories"].metadata["class_names"] = self.cat_names_stuff
+        self.dataset.add_column(
+            "stuff/super_categories", dp.types.ClassLabel(dp.types.Array("uint32", 1))
+        )
+        self.dataset["stuff/super_categories"].metadata[
+            "class_names"
+        ] = self.super_cat_names_stuff
+        self.dataset.add_column("stuff/areas", dp.types.Array("uint32", 1))
+        self.dataset.add_column("stuff/iscrowds", dp.types.Array("bool", 1))
+
+        # Update column metadata
+        self.dataset["categories"].metadata["category_info"] = self.category_info
+        self.dataset["categories"].metadata["notes"] = (
+            "Numeric labels for categories represent the position of the class in "
+            "the ds['categories'].metadata['class_names'] list, and not the COCO "
+            "category id."
+        )
+        self.dataset["super_categories"].metadata["category_info"] = self.category_info
+        self.dataset["super_categories"].metadata["notes"] = (
+            "Numeric labels for categories represent the position of the class in "
+            "the ds['super_categories'].metadata['class_names'] list, and not the "
+            "COCO category id."
+        )
+
+        self.dataset["masks"].metadata["notes"] = MASKS_NOTE
+        self.dataset["pose/masks"].metadata["category_info"] = self.category_info_kp
+        self.dataset["pose/masks"].metadata["notes"] = MASKS_NOTE
+        self.dataset["pose/keypoints"].metadata["keypoints"] = [
+            category["keypoints"] for category in self.category_info_kp
+        ][0]
+        self.dataset["pose/keypoints"].metadata["connections"] = [
+            category["skeleton"] for category in self.category_info_kp
+        ][0]
+
+        self.dataset["stuff/masks"].metadata["category_info"] = self.category_info_stuff
+        self.dataset["stuff/masks"].metadata["notes"] = MASKS_NOTE
+
+    def ingest_columns(self):
+        for img_id in tqdm(self.img_ids):
+            ann_ids = self.coco.getAnnIds(img_id)
+            ann_ids_kp = self.coco_kp.getAnnIds(img_id)
+            ann_ids_stuff = self.coco_stuff.getAnnIds(img_id)
+            anns = self.coco.loadAnns(ann_ids)
+            anns_kp = self.coco_kp.loadAnns(ann_ids_kp)
+            anns_stuff = self.coco_stuff.loadAnns(ann_ids_stuff)
+
+            img_coco = self.coco.loadImgs(img_id)[0]
+            img_path = os.path.join(self.images_directory, img_coco["file_name"])
+            with open(img_path, "rb") as file:
+                image_bytes = file.read()
+            (height, width) = (img_coco["height"], img_coco["width"])
+            masks = np.zeros((height, width, len(anns)))
+            boxes = np.zeros((len(anns), 4))
+            categories = np.zeros((len(anns)))
+            areas = np.zeros((len(anns)))
+            iscrowds = np.zeros((len(anns)))
+            supercats = np.zeros((len(anns)))
+
+            for i, ann in enumerate(anns):
+                mask = self.coco.annToMask(ann)
+                masks[:, :, i] = mask
+                boxes[i, :] = ann["bbox"]
+
+                categories[i] = self.cat_names.index(
+                    [
+                        self.category_info[j]["name"]
+                        for j in range(len(self.category_info))
+                        if self.category_info[j]["id"] == ann["category_id"]
+                    ][0]
+                )
+                supercats[i] = self.super_cat_names.index(
+                    [
+                        self.category_info[j]["supercategory"]
+                        for j in range(len(self.category_info))
+                        if self.category_info[j]["id"] == ann["category_id"]
+                    ][0]
+                )
+
+                areas[i] = ann["area"]
+                iscrowds[i] = ann["iscrowd"]
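+                # A missing segmentation entry is unexpected for COCO
+                # instances; log it but keep ingesting.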
+                if "segmentation" not in ann:
+                    print(
+                        "----No segmentation found for image id {} "
+                        "({} annotations); continuing.----".format(img_id, len(anns))
+                    )
+
+            (categories_kp, supercats_kp, masks_kp, boxes_kp, keypoints_kp) = (
+                self.get_kp_group_data(height, width, anns_kp)
+            )
+
+            (
+                masks_stuff,
+                boxes_stuff,
+                categories_stuff,
+                areas_stuff,
+                iscrowds_stuff,
+                supercats_stuff,
+            ) = self.get_stuff_group_data(height, width, anns_stuff)
+
+            in_dict = {
+                "images": [image_bytes],
+                "images_meta": [img_coco],
+                "masks": [masks.astype("bool")],
+                "boxes": [boxes.astype("float32")],
+                "categories": [categories.astype("uint32")],
+                "super_categories": [supercats.astype("uint32")],
+                "areas": [areas.astype("uint32")],
+                "iscrowds": [iscrowds.astype("bool")],
+                "pose/categories": [categories_kp.astype("uint32")],
+                "pose/super_categories": [supercats_kp.astype("uint32")],
+                "pose/boxes": [boxes_kp.astype("float32")],
+                "pose/masks": [masks_kp.astype("bool")],
+                "pose/keypoints": [keypoints_kp.astype("int32")],
+                "stuff/masks": [masks_stuff.astype("bool")],
+                "stuff/boxes": [boxes_stuff.astype("float32")],
+                "stuff/categories": [categories_stuff.astype("uint32")],
+                "stuff/super_categories": [supercats_stuff.astype("uint32")],
+                "stuff/areas": [areas_stuff.astype("uint32")],
+                "stuff/iscrowds": [iscrowds_stuff.astype("bool")],
+            }
+            self.dataset.append(in_dict)
+        self.dataset.commit("Finished ingestion")
+
+    def structure(self):
+        self.create_structure()
+        self.ingest_columns()
+
+
+def ingest_coco(
+    images_directory: Union[str, pathlib.Path],
+    annotation_files: Dict[str, Union[str, pathlib.Path]],
+    dest: Union[str, pathlib.Path],
+    dest_creds: Optional[Dict[str, str]] = None,
+):
+    """Ingest images and annotations in COCO format into a Deep Lake dataset. The source data can be stored locally or in the cloud.
+
+    Args:
+        images_directory (str, pathlib.Path): The path to the directory containing images.
+        annotation_files (Dict[str, Union[str, pathlib.Path]]): Dictionary mapping an annotation key to the path of a JSON annotation file in COCO format.
+            - The required keys are `instances`, `keypoints` and `stuff`.
+        dest (str, pathlib.Path): The full path to the dataset. Can be:
+            - a Deep Lake cloud path of the form ``al://org_id/datasetname``. To write to Deep Lake cloud datasets, ensure that you are authenticated to Deep Lake (for example via the ACTIVELOOP_TOKEN environment variable).
+            - an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
+            - a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
+            - a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
+        dest_creds (Optional[Dict[str, str]]): The dictionary containing credentials used to access the destination path of the dataset.
+
+    Returns:
+        Dataset: The Dataset created from the images and COCO annotations.
+
+    Raises:
+        CocoAnnotationMissingError: If one or more of the required annotation keys are missing.
+ """ + + dest = convert_pathlib_to_string_if_needed(dest) + images_directory = convert_pathlib_to_string_if_needed(images_directory) + + annotation_files = verify_coco_annotation_dict(annotation_files) + + dist_ds = dp.create(dest, dict(dest_creds) if dest_creds is not None else {}) + + unstructured = COCOStructuredDataset( + dataset=dist_ds, + images_directory=images_directory, + annotation_files=annotation_files, + ) + + unstructured.structure() + + return dist_ds diff --git a/python/deeplake/integrations/__init__.py b/python/deeplake/integrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/deeplake/integrations/constants.py b/python/deeplake/integrations/constants.py new file mode 100644 index 0000000000..32b2f7dc15 --- /dev/null +++ b/python/deeplake/integrations/constants.py @@ -0,0 +1,5 @@ +# constant showing the GPU memory cleanup interval +TIME_INTERVAL_FOR_CUDA_MEMORY_CLEANING = 10 * 60 + +# DEEPLAKE_AUTH_TOKEN holds the value of the ACTIVELOOP_TOKEN environment variable +DEEPLAKE_AUTH_TOKEN = "ACTIVELOOP_TOKEN" diff --git a/python/deeplake/integrations/mm/__init__.py b/python/deeplake/integrations/mm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/deeplake/integrations/mm/exceptions.py b/python/deeplake/integrations/mm/exceptions.py new file mode 100644 index 0000000000..dffa82406c --- /dev/null +++ b/python/deeplake/integrations/mm/exceptions.py @@ -0,0 +1,24 @@ +class EmptyTokenException(Exception): + def __init__(self, message="The authentication token is empty."): + super().__init__(message) + + +class ValidationDatasetMissingError(Exception): + def __init__(self): + msg = ( + "Validation dataset is not specified even though validate = True. " + "Please set validate = False or specify a validation dataset." + ) + super().__init__(msg) + + +class InvalidImageError(Exception): + def __init__(self, column_name, ex): + msg = f"Error on {column_name} data getting: {str(ex)}" + super().__init__(msg) + + +class InvalidSegmentError(Exception): + def __init__(self, column_name, ex): + msg = f"Error on {column_name} data getting: {str(ex)}" + super().__init__(msg) diff --git a/python/deeplake/integrations/mm/get_indexes.py b/python/deeplake/integrations/mm/get_indexes.py new file mode 100644 index 0000000000..74ae89bba7 --- /dev/null +++ b/python/deeplake/integrations/mm/get_indexes.py @@ -0,0 +1,66 @@ +import math +from typing import Optional + + +def get_indexes( + dataset, + rank: Optional[int] = None, + num_replicas: Optional[int] = None, + drop_last: Optional[bool] = None, +): + """ + Generates a slice for a given rank in a distributed setting, dividing + the dataset evenly across multiple replicas. + + Parameters: + dataset (Dataset): The dataset to split across distributed replicas. + rank (Optional[int]): The rank of the current process. If not specified, + the function will use the distributed package to get the current rank. + num_replicas (Optional[int]): Total number of replicas (i.e., processes) involved in distributed training. + If not specified, the function will determine the number based on the world size. + drop_last (Optional[bool]): If True, drop the extra data not evenly divisible among replicas. + This is useful for maintaining equal batch sizes across replicas. + + Returns: + slice: A slice object representing the start and end indices for the current rank's portion of the dataset. 
+ + Raises: + RuntimeError: If the distributed package is not available when `rank` or `num_replicas` are not specified. + ValueError: If the specified `rank` is out of range based on the number of replicas. + + Notes: + This function requires the `torch.distributed` package to determine the number of replicas and + rank when they are not provided. It is useful in distributed data loading to ensure each process + gets a specific subset of the data. + """ + import torch.distributed as dist + + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + if rank >= num_replicas or rank < 0: + raise ValueError( + "Invalid rank {}, rank should be in the interval" + " [0, {}]".format(rank, num_replicas - 1) + ) + + dataset_length = len(dataset) + + if drop_last: + total_size = (dataset_length // num_replicas) * num_replicas + per_process = total_size // num_replicas + else: + per_process = math.ceil(dataset_length / num_replicas) + total_size = per_process * num_replicas + + start_index = rank * per_process + end_index = min(start_index + per_process, total_size) + + end_index = min(end_index, dataset_length) + + return slice(start_index, end_index) diff --git a/python/deeplake/integrations/mm/ipc.py b/python/deeplake/integrations/mm/ipc.py new file mode 100644 index 0000000000..71e418db7f --- /dev/null +++ b/python/deeplake/integrations/mm/ipc.py @@ -0,0 +1,6 @@ +import socketserver + + +def _get_free_port() -> int: + with socketserver.TCPServer(("localhost", 0), None) as s: # type: ignore + return s.server_address[1] diff --git a/python/deeplake/integrations/mm/mm_common.py b/python/deeplake/integrations/mm/mm_common.py new file mode 100644 index 0000000000..dcaa5c639d --- /dev/null +++ b/python/deeplake/integrations/mm/mm_common.py @@ -0,0 +1,220 @@ +import os +import torch +import warnings +import mmcv # type: ignore +import deeplake as dp +from deeplake.types import TypeKind +from deeplake.integrations.mm.warnings import always_warn +from deeplake.integrations.mm.exceptions import EmptyTokenException +from deeplake.integrations.constants import DEEPLAKE_AUTH_TOKEN + + +def ddp_setup(rank: int, world_size: int, port: int): + """ + Args: + rank: Unique identifier of each process + world_size: Total number of processes + port: Port number + """ + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(port) + torch.distributed.init_process_group( + backend="nccl", rank=rank, world_size=world_size + ) + + +def force_cudnn_initialization(device_id): + dev = torch.device(f"cuda:{device_id}") + torch.nn.functional.conv2d( + torch.zeros(32, 32, 32, 32, device=dev), torch.zeros(32, 32, 32, 32, device=dev) + ) + + +def load_ds_from_cfg(cfg: mmcv.utils.config.ConfigDict): + creds = cfg.get("deeplake_credentials", {}) + token = creds.get("token", None) + deeplake_commit = cfg.get("deeplake_commit") + deeplake_tag_id = cfg.get("deeplake_tag_id") + deeplake_query = cfg.get("deeplake_query") + token = token or os.environ.get(DEEPLAKE_AUTH_TOKEN) + if token is None: + raise EmptyTokenException() + + try: + ds = dp.open_read_only(cfg.deeplake_path, token=token, creds=creds) + except Exception: + if not deeplake_query: + raise + ds = dp.query(deeplake_query) + + if deeplake_tag_id and deeplake_query: + raise Exception( + "A query and tag_id were specified
simultaneously for a dataset in the config. Please specify either the deeplake_query or the deeplake_tag_id." + ) + + if deeplake_commit: + ds.checkout(deeplake_commit) + + if deeplake_tag_id: + ds = ds.tags(deeplake_tag_id).open() + + if deeplake_query: + ds = ds.query(deeplake_query) + + return ds + + +def get_collect_keys(cfg): + pipeline = cfg.train_pipeline + for transform in pipeline: + if transform["type"] == "Collect": + return transform["keys"] + raise ValueError("collection keys were not specified") + + +def check_persistent_workers(train_persistent_workers, val_persistent_workers): + if train_persistent_workers != val_persistent_workers: + if train_persistent_workers: + always_warn( + "persistent workers for training and evaluation should be identical, " + "otherwise, this could lead to performance issues. " + "Either both of them should be `True` or both of them should be `False`. " + "If you want to use persistent workers, set True for validation" + ) + else: + always_warn( + "persistent workers for training and evaluation should be identical, " + "otherwise, this could lead to performance issues. " + "Either both of them should be `True` or both of them should be `False`. " + "If you want to use persistent workers, set True for training" + ) + + +def find_image_tensor(ds: dp.Dataset, mm_class=None): + images = [ + col.name + for col in ds.schema.columns + if ds.schema[col.name].dtype.is_image + ] + if mm_class is not None: + always_warn( + f"No deeplake column name specified for '{mm_class}' in config. Fetching it using type_kind '{TypeKind.Image}'." + ) + if not images: + always_warn(f"No column found with type_kind='{TypeKind.Image}'") + return None + t = images[0] + if len(images) > 1: + always_warn( + f"Multiple columns with type_kind='{TypeKind.Image}' found. Choosing '{t}'." + ) + print(f"columns {images} kind {TypeKind.Image} mm_class {mm_class} t {t}") + return t + + +def find_smask_tensor(ds: dp.Dataset, mm_class=None): + smasks = [ + col.name + for col in ds.schema.columns + if ds.schema[col.name].dtype.is_segment_mask + ] + if mm_class is not None: + always_warn( + f"No deeplake column name specified for '{mm_class}' in config. Fetching it using type_kind '{TypeKind.SegmentMask}'." + ) + if not smasks: + always_warn(f"No column found with type_kind='{TypeKind.SegmentMask}'") + return None + t = smasks[0] + if len(smasks) > 1: + always_warn( + f"Multiple columns with type_kind='{TypeKind.SegmentMask}' found. Choosing '{t}'." + ) + print(f"columns {smasks} kind {TypeKind.SegmentMask} mm_class {mm_class} t {t}") + return t + + +def find_tensor_with_htype(ds: dp.Dataset, type_kind=TypeKind.Image, mm_class=None): + columns = [col.name for col in ds.schema.columns if col.dtype.kind == type_kind] + if mm_class is not None: + always_warn( + f"No deeplake column name specified for '{mm_class}' in config. Fetching it using type_kind '{type_kind}'." + ) + if not columns: + always_warn(f"No column found with type_kind='{type_kind}'") + return None + t = columns[0] + if len(columns) > 1: + always_warn( + f"Multiple columns with type_kind='{type_kind}' found. Choosing '{t}'."
+ ) + + print(f"columns {columns} kind {type_kind} mm_class {mm_class} t {t}") + return t + + +def check_unsupported_functionalities(cfg): + check_unused_dataset_fields(cfg) + check_unsupported_train_pipeline_fields(cfg, mode="train") + check_unsupported_train_pipeline_fields(cfg, mode="val") + check_dataset_augmentation_formats(cfg) + + +def check_unused_dataset_fields(cfg): + if cfg.get("dataset_type"): + always_warn( + "The deeplake mmdet integration does not use dataset_type to work with the data and compute metrics. All deeplake datasets are in the same deeplake format. To specify a metrics format, you should use deeplake_metrics_format " + ) + + if cfg.get("data_root"): + always_warn( + "The deeplake mmdet integration does not use data_root, this input will be ignored" + ) + + +def check_unsupported_train_pipeline_fields(cfg, mode="train"): + transforms = cfg.data[mode].pipeline + + for transform in transforms: + transform_type = transform.get("type") + + if transform_type == "LoadImageFromFile": + always_warn( + "LoadImageFromFile is going to be skipped because deeplake mmdet integration does not use it" + ) + + if transform_type == "LoadAnnotations": + always_warn( + "LoadAnnotations is going to be skipped because deeplake mmdet integration does not use it" + ) + + if transform_type == "Corrupt": + raise Exception("Corrupt augmentation is not supported yet.") + + elif transform_type == "CopyPaste": # TO DO: @adolkhan resolve this + raise Exception("CopyPaste augmentation is not supported yet.") + + elif transform_type == "CutOut": # TO DO: @adolkhan resolve this + raise Exception("CutOut augmentation is not supported yet.") + + elif transform_type == "Mosaic": # TO DO: @adolkhan resolve this + raise Exception("Mosaic augmentation is not supported yet.") + + +def check_dataset_augmentation_formats(cfg): + if cfg.get("train_dataset"): + always_warn( + "train_dataset is going to be unused. Dataset types like: ConcatDataset, RepeatDataset, ClassBalancedDataset, MultiImageMixDataset are not supported." + ) + + +def get_pipeline(cfg, *, name: str, generic_name: str): + pipeline = cfg.data[name].get("pipeline", None) + if pipeline is None: + warnings.warn( + f"Warning: The '{name}' data pipeline is missing in the configuration. Attempting to locate it in '{generic_name}'."
+ ) + + pipeline = cfg.get(generic_name, []) + + return pipeline diff --git a/python/deeplake/integrations/mm/mm_runners.py b/python/deeplake/integrations/mm/mm_runners.py new file mode 100644 index 0000000000..97b9786f10 --- /dev/null +++ b/python/deeplake/integrations/mm/mm_runners.py @@ -0,0 +1,152 @@ +import mmcv # type: ignore + +import torch +import logging +from mmcv import runner +from torch.utils.data import DataLoader + +import time +import warnings +from typing import List, Tuple, Optional +from deeplake.integrations.constants import TIME_INTERVAL_FOR_CUDA_MEMORY_CLEANING + + +def empty_cuda(): + try: + torch.cuda.empty_cache() + except Exception: + pass + return + + +@runner.RUNNERS.register_module() +class DeeplakeIterBasedRunner(runner.IterBasedRunner): + def __init__(self, **kwargs): + self.force_cleanup = kwargs.pop("force_cleanup", True) + super().__init__(**kwargs) + + def run( + self, + data_loaders: List[DataLoader], + workflow: List[Tuple[str, int]], + max_iters: Optional[int] = None, + **kwargs, + ) -> None: + assert isinstance(data_loaders, list) + assert mmcv.is_list_of(workflow, tuple) + assert len(data_loaders) == len(workflow) + if max_iters is not None: + warnings.warn( + "setting max_iters in run is deprecated, " + "please set max_iters in runner_config", + DeprecationWarning, + ) + self._max_iters = max_iters + assert ( + self._max_iters is not None + ), "max_iters must be specified during instantiation" + + work_dir = self.work_dir if self.work_dir is not None else "NONE" + self.logger.info( + "Start running, host: %s, work_dir: %s", + runner.utils.get_host_info(), + work_dir, + ) + self.logger.info( + "Hooks will be executed in the following order:\n%s", self.get_hook_info() + ) + self.logger.info("workflow: %s, max: %d iters", workflow, self._max_iters) + self.call_hook("before_run") + + iter_loaders = [runner.IterLoader(x) for x in data_loaders] + + self.call_hook("before_epoch") + + formatter = logging.Formatter("%(relative)ss") + start_time = time.time() + + while self.iter < self._max_iters: + for i, flow in enumerate(workflow): + self._inner_iter = 0 + mode, iters = flow + if not isinstance(mode, str) or not hasattr(self, mode): + raise ValueError( + 'runner has no method named "{}" to run a workflow'.format(mode) + ) + iter_runner = getattr(self, mode) + for _ in range(iters): + if mode == "train" and self.iter >= self._max_iters: + break + + iter_time = time.time() + + if ( + self.force_cleanup + and iter_time - start_time + > TIME_INTERVAL_FOR_CUDA_MEMORY_CLEANING + ): + empty_cuda() + start_time = iter_time + iter_runner(iter_loaders[i], **kwargs) + + time.sleep(1) # wait for some hooks like loggers to finish + self.call_hook("after_epoch") + self.call_hook("after_run") + + +@runner.RUNNERS.register_module() +class DeeplakeEpochBasedRunner(runner.EpochBasedRunner): + def __init__(self, **kwargs): + self.force_cleanup = kwargs.pop("force_cleanup", True) + super().__init__(**kwargs) + + def train(self, data_loader, **kwargs): + start_time = time.time() + self.model.train() + self.mode = "train" + self.data_loader = data_loader + self._max_iters = self._max_epochs * len(self.data_loader) + self.call_hook("before_train_epoch") + time.sleep(2) # Prevent possible deadlock during epoch transition + for i, data_batch in enumerate(self.data_loader): + self.data_batch = data_batch + self._inner_iter = i + self.call_hook("before_train_iter") + self.run_iter(data_batch, train_mode=True, **kwargs) + self.call_hook("after_train_iter") + del self.data_batch + 
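# Drop the batch reference before the iteration bookkeeping below so its tensors can be garbage-collected promptly. +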
self._iter += 1 + iter_time = time.time() + if ( + self.force_cleanup + and iter_time - start_time > TIME_INTERVAL_FOR_CUDA_MEMORY_CLEANING + ): + empty_cuda() + start_time = iter_time + + self.call_hook("after_train_epoch") + self._epoch += 1 + + @torch.no_grad() + def val(self, data_loader, **kwargs): + start_time = time.time() + self.model.eval() + self.mode = "val" + self.data_loader = data_loader + self.call_hook("before_val_epoch") + time.sleep(2) # Prevent possible deadlock during epoch transition + for i, data_batch in enumerate(self.data_loader): + self.data_batch = data_batch + self._inner_iter = i + self.call_hook("before_val_iter") + self.run_iter(data_batch, train_mode=False) + self.call_hook("after_val_iter") + del self.data_batch + iter_time = time.time() + if ( + self.force_cleanup + and iter_time - start_time > TIME_INTERVAL_FOR_CUDA_MEMORY_CLEANING + ): + empty_cuda() + start_time = iter_time + self.call_hook("after_val_epoch") diff --git a/python/deeplake/integrations/mm/upcast_array.py b/python/deeplake/integrations/mm/upcast_array.py new file mode 100644 index 0000000000..8f94dcdb32 --- /dev/null +++ b/python/deeplake/integrations/mm/upcast_array.py @@ -0,0 +1,15 @@ +import numpy as np +from typing import Union + + +def upcast_array(arr: Union[np.ndarray, bytes]): + if isinstance(arr, list): + return [upcast_array(a) for a in arr] + if isinstance(arr, np.ndarray): + if arr.dtype == np.uint16: + return arr.astype(np.int32) + if arr.dtype == np.uint32: + return arr.astype(np.int64) + if arr.dtype == np.uint64: + return arr.astype(np.int64) + return arr diff --git a/python/deeplake/integrations/mm/warnings.py b/python/deeplake/integrations/mm/warnings.py new file mode 100644 index 0000000000..fc2193d487 --- /dev/null +++ b/python/deeplake/integrations/mm/warnings.py @@ -0,0 +1,7 @@ +import warnings + + +def always_warn(*args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("always") + warnings.warn(*args, **kwargs) diff --git a/python/deeplake/integrations/mm/worker_init_fn.py b/python/deeplake/integrations/mm/worker_init_fn.py new file mode 100644 index 0000000000..43c4282d80 --- /dev/null +++ b/python/deeplake/integrations/mm/worker_init_fn.py @@ -0,0 +1,21 @@ +import numpy as np +import torch +import random + + +def worker_init_fn(worker_id, num_workers, rank, seed): + """Worker init func for dataloader. + + The seed of each worker equals num_workers * rank + worker_id + seed + + Args: + worker_id (int): Worker id. + num_workers (int): Number of workers. + rank (int): The rank of current process. + seed (int): The random seed to use. + """ + + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + torch.manual_seed(worker_seed) diff --git a/python/deeplake/integrations/mmdet/__init__.py b/python/deeplake/integrations/mmdet/__init__.py new file mode 100644 index 0000000000..3731638a14 --- /dev/null +++ b/python/deeplake/integrations/mmdet/__init__.py @@ -0,0 +1,2 @@ +from deeplake.integrations.mmdet.mmdet_ import train_detector +from mmdet.models import build_detector # type: ignore diff --git a/python/deeplake/integrations/mmdet/mmdet_.py b/python/deeplake/integrations/mmdet/mmdet_.py new file mode 100644 index 0000000000..32a9e79573 --- /dev/null +++ b/python/deeplake/integrations/mmdet/mmdet_.py @@ -0,0 +1,813 @@ +""" +Deep Lake offers an integration with MMDetection, a popular open-source object detection toolbox based on PyTorch.
+The integration enables users to train models while streaming Deep Lake datasets using the transformation, training, and evaluation tools built by MMDet. + +Learn more about MMDetection `here `_. + +Integration Interface +~~~~~~~~~~~~~~~~~~~~~ +MMDetection works with configs. Deeplake adopted this strategy, and in order to train MMDet models, you need to create/specify your model +and training/validation config. Deep Lake integration's logic is almost the same as MMDetection's with some minor modifications. The integration +with MMDET occurs in the deeplake.integrations.mmdet module. At a high level, Deep Lake is responsible for the pytorch dataloader that streams data +to the training framework, while MMDET is used for the training, transformation, and evaluation logic. Let us take a look at the config with deeplake changes: + +Deeplake integration requires the following parameters to be specified in the configuration file: + +- ``data``: Just like in the MMDetection configuration files, in the data dictionary you can specify everything that you want to be applied to the data during training and validation + - ``train``: Keyword argument of data, a dictionary where one can specify dataset path, credentials, transformations of the training data + - ``val``: Keyword argument of data, a dictionary where one can specify dataset path, credentials, transformations of the validation data + - ``pipeline``: List of transformations. This parameter exists for train as well as for val. + + - Example: + + >>> pipeline = [dict(type="Resize", img_scale=[(320, 320), (608, 608)], keep_ratio=True), dict(type="RandomFlip", flip_ratio=0.5), dict(type="PhotoMetricDistortion")] + + - ``deeplake_path``: Path to the deeplake dataset. This parameter exists for train as well as for val. + - ``deeplake_credentials``: Optional parameter. Required only when using private nonlocal datasets. See documentation for `deeplake.open_read_only() <https://docs.deeplake.ai/latest/api/dataset/#deeplake.open_read_only>`_ for details. This parameter exists for train as well as for val. + - ``deeplake_tag_id``: Optional parameter. If specified, the dataset will be checked out to the tag. This parameter exists for train as well as for val. See documentation for `Dataset.commit_id `_ + - ``deeplake_query``: Optional parameter. If specified, the dataset is loaded from the query result when deeplake_path is not specified; if deeplake_path is specified, the query is applied to that dataset. + - ``deeplake_tensors``: Optional parameter. If specified, maps MMDetection tensors to the associated tensors in the dataset. MMDet tensors are: "img", "gt_bboxes", "gt_labels", "gt_masks". This parameter exists for train as well as for val. + - ``"img"``: Stands for image tensor. + - ``"gt_bboxes"``: Stands for bounding box tensor. + - ``"gt_labels"``: Stands for labels tensor. + - ``"gt_masks"``: Stands for masks tensor. + + - ``deeplake_dataloader``: Optional parameter. If specified, represents the parameters of the deeplake dataloader. Deeplake dataloader parameters are: "shuffle", "batch_size", "num_workers". This parameter exists for train as well as for val. + - ``"shuffle"``: If ``True`` shuffles the dataset. + - ``"batch_size"``: Size of batch. If not specified, dataloader will use ``samples_per_gpu``. + - ``"num_workers"``: Number of workers to use. If not specified, dataloader will use ``workers_per_gpu``. + +- ``deeplake_metrics_format``: Optional parameter.
If specified, it represents the format of the deeplake metrics that will be used during evaluation. Defaults to COCO. + Available values are: "COCO", "PascalVOC". If the COCO format is used, you can specify whether you want to evaluate on bbox only or also on masks. + To do that, specify the metric types in the ``metric`` key of the evaluation dict. + +Example: + +>>> deeplake_metrics_format = "COCO" +>>> evaluation = dict(metric=["bbox"], interval=1) + +- ``train_detector``: Function to train the MMDetection model. + + Parameters: + + - ``model``: MMDetection model that is going to be used. + - ``cfg``: mmcv.ConfigDict, Configuration of the model as well as of the datasets and transforms that are going to be used. + - ``ds_train``: Optional parameter. If provided, it will overwrite deeplake_path in train, and this dataset will be passed directly to the dataloader. + - ``ds_val``: Optional parameter. If provided, it will overwrite deeplake_path in val, and this dataset will be passed directly to the dataloader. + - ``ds_train_tensors``: Optional parameter. If provided, it will overwrite deeplake_tensors in train, and this tensor mapping will be passed directly to the dataloader. + - ``ds_val_tensors``: Optional parameter. If provided, it will overwrite deeplake_tensors in val, and this tensor mapping will be passed directly to the dataloader. + - ``distributed``: Optional parameter. If provided, the code will run on all available gpus. + - ``meta``: Optional parameter. Meta data used to build the runner. + - ``timestamp``: Variable used in runner to make .log and .log.json filenames the same. + - ``validate``: Bool, whether validation should be run, defaults to ``True``. + +NOTE: + ``gt_masks`` is an optional parameter; if you want to train a pure detector, it can be excluded. The other mappings are mandatory: + if you don't specify them explicitly, they are going to be searched in the dataset according to tensor htype. It is better to specify them explicitly. + +MMDetection Config Examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Below is an example of the deeplake mmdet configuration: + + +>>> _base_ = "../mmdetection/configs/yolo/yolov3_d53_mstrain-416_273e_coco.py" +>>> # use caffe img_norm +>>> img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True) +>>> train_pipeline = [ +... dict(type='LoadImageFromFile'), +... dict(type='LoadAnnotations', with_bbox=True), +... dict( +... type='Expand', +... mean=img_norm_cfg['mean'], +... to_rgb=img_norm_cfg['to_rgb'], +... ratio_range=(1, 2)), +... dict(type='Resize', img_scale=[(320, 320), (416, 416)], keep_ratio=True), +... dict(type='RandomFlip', flip_ratio=0.0), +... dict(type='PhotoMetricDistortion'), +... dict(type='Normalize', **img_norm_cfg), +... dict(type='Pad', size_divisor=32), +... dict(type='DefaultFormatBundle'), +... dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +... ] +>>> test_pipeline = [ +... dict(type='LoadImageFromFile'), +... dict( +... type='MultiScaleFlipAug', +... img_scale=(416, 416), +... flip=False, +... transforms=[ +... dict(type='Resize', keep_ratio=True), +... dict(type='RandomFlip', flip_ratio=0.0), +... dict(type='Normalize', **img_norm_cfg), +... dict(type='Pad', size_divisor=32), +... dict(type='ImageToTensor', keys=['img']), +... dict(type='Collect', keys=['img']) +... ]) +... ] +>>> #--------------------------------------DEEPLAKE INPUTS------------------------------------------------------------# +>>> TOKEN = "INSERT_YOUR_DEEPLAKE_TOKEN" +>>> data = dict( +... # samples_per_gpu=4, # Is used instead of batch_size if deeplake_dataloader is not specified below +...
# workers_per_gpu=8, # Is used instead of num_workers if deeplake_dataloader is not specified below +... train=dict( +... pipeline=train_pipeline, +... # Credentials for authentication. See documentation for deeplake.open() for details +... deeplake_path="al://activeloop/coco-train", +... deeplake_credentials={ +... "token": TOKEN, +... "creds": None, +... }, +... #OPTIONAL - Checkout the specified commit before training +... deeplake_commit="", +... #OPTIONAL - Loads a dataset tag for training based on tag_id +... deeplake_tag_id="", +... # OPTIONAL - {"mmdet_key": "deep_lake_tensor",...} - Maps Deep Lake tensors to MMDET dictionary keys. +... # If not specified, Deep Lake will auto-infer the mapping, but it might make mistakes if datasets have many tensors +... deeplake_tensors = {"img": "images", "gt_bboxes": "boxes", "gt_labels": "categories", "gt_masks": "masks"}, +... # OPTIONAL - Parameters to use for the Deep Lake dataloader. If unspecified, the integration uses +... # the parameters in other parts of the cfg file such as samples_per_gpu, and others. +... deeplake_dataloader = {"shuffle": True, "batch_size": 4, 'num_workers': 8} +... ), +... # Parameters are the same as for train +... val=dict( +... pipeline=test_pipeline, +... deeplake_path="al://activeloop/coco-val", +... deeplake_credentials={ +... "token": TOKEN, +... "creds": None, +... }, +... deeplake_tensors = {"img": "images", "gt_bboxes": "boxes", "gt_labels": "categories"}, +... deeplake_dataloader = {"shuffle": False, "batch_size": 1, 'num_workers': 8} +... ), +... ) +>>> # Which dataloader to use +>>> # Which metrics to use for evaluation. In MMDET (without Deeplake), this is inferred from the dataset type. +>>> # In the Deep Lake integration, since the format is standardized, a variety of metrics can be used for a given dataset. +>>> deeplake_metrics_format = "COCO" +>>> #----------------------------------END DEEPLAKE INPUTS------------------------------------------------------------# + +And the code for training: + +>>> import os +>>> from mmcv import Config +>>> import mmcv +>>> from deeplake.integrations import mmdet as mmdet_deeplake +>>> cfg = Config.fromfile(cfg_file) +>>> cfg.model.bbox_head.num_classes = num_classes +>>> # Build the detector +>>> model = mmdet_deeplake.build_detector(cfg.model) +>>> # Create work_dir +>>> mmcv.mkdir_or_exist(os.path.abspath(cfg.work_dir)) +>>> # Run the training +>>> mmdet_deeplake.train_detector(model, cfg, distributed=args.distributed, validate=args.validate) +""" + +from collections import OrderedDict + +from typing import Callable, Optional, List, Dict, Sequence + +from functools import partial + +import os +import math +import types +import torch +import warnings +import tempfile +import numpy as np +import os.path as osp + +from PIL import Image, ImageDraw # type: ignore + +from terminaltables import AsciiTable # type: ignore + +try: + from mmdet.apis.train import auto_scale_lr # type: ignore +except Exception: + import mmdet # type: ignore + + version = mmdet.__version__ + raise Exception( + f"MMDet {version} version is not supported. The latest supported MMDet version with deeplake is 2.28.1."
+ ) +from mmdet.utils import ( # type: ignore + build_dp, + compat_cfg, + find_latest_checkpoint, + get_root_logger, +) +from mmdet.core import DistEvalHook, EvalHook # type: ignore +from mmdet.core import build_optimizer + +from mmdet.datasets import replace_ImageToTensor # type: ignore + +from mmdet.datasets.builder import PIPELINES # type: ignore +from mmdet.datasets.pipelines import Compose # type: ignore +from mmdet.core import BitmapMasks # type: ignore +from mmdet.core import eval_map, eval_recalls +from mmdet.utils.util_distribution import * # type: ignore +from mmdet.core import BitmapMasks, PolygonMasks + +import mmcv # type: ignore +from mmcv.runner import init_dist # type: ignore +from mmcv.parallel import collate # type: ignore +from mmcv.utils import build_from_cfg, digit_version # type: ignore +from mmcv.utils import print_log +from mmcv.runner import ( # type: ignore + DistSamplerSeedHook, + EpochBasedRunner, + Fp16OptimizerHook, + OptimizerHook, + build_runner, + get_dist_info, +) + +import deeplake as dp +from deeplake.types import TypeKind +from deeplake.integrations.mm.exceptions import ValidationDatasetMissingError + +from deeplake.integrations.mmdet.mmdet_dataset_ import ( + MMDetTorchDataset, + MMDetDataset, + transform, +) +from deeplake.integrations.mm.ipc import _get_free_port +from deeplake.integrations.mm.warnings import always_warn +from deeplake.integrations.mm.get_indexes import get_indexes +from deeplake.integrations.mm.upcast_array import upcast_array +from deeplake.integrations.mm.worker_init_fn import worker_init_fn +from deeplake.integrations.mm.mm_runners import DeeplakeIterBasedRunner +from deeplake.integrations.mm.mm_common import ( + load_ds_from_cfg, + get_collect_keys, + check_persistent_workers, + find_tensor_with_htype, + find_image_tensor, + ddp_setup, + force_cudnn_initialization, + check_unsupported_functionalities, + get_pipeline, +) + +from torch.utils.data import DataLoader + +# Monkey-patch the function +from deeplake.integrations.mmdet.test_ import single_gpu_test as custom_single_gpu_test +from deeplake.integrations.mmdet.test_ import multi_gpu_test as custom_multi_gpu_test + +import mmdet.apis + +mmdet.apis.single_gpu_test = custom_single_gpu_test +mmdet.apis.multi_gpu_test = custom_multi_gpu_test + + +def build_ddp(model, device, *args, **kwargs): + """Build DistributedDataParallel module by device type. + + If device is cuda, return a MMDistributedDataParallel model; + if device is mlu, return a MLUDistributedDataParallel model. + + Args: + model (:class:`nn.Module`): module to be parallelized. + device (str): device type, mlu or cuda. + args (List): arguments to be passed to ddp_factory + kwargs (dict): keyword arguments to be passed to ddp_factory + + Returns: + :class:`nn.Module`: the module to be parallelized + + References: + .. [1] https://pytorch.org/docs/stable/generated/torch.nn.parallel. + DistributedDataParallel.html + """ + + assert device in ["cuda", "mlu"], "Only available for cuda or mlu devices." 
+ if device == "cuda": + model = model.cuda(kwargs["device_ids"][0]) # patch + elif device == "mlu": + from mmcv.device.mlu import MLUDistributedDataParallel # type: ignore + + ddp_factory["mlu"] = MLUDistributedDataParallel + model = model.mlu() + + return ddp_factory[device](model, *args, **kwargs) + + +def mmdet_subiterable_dataset_eval( + self, + *args, + **kwargs, +): + return self.dataset.mmdet_dataset.evaluate(*args, **kwargs) + + +def build_dataloader( + dataset: dp.Dataset, + images_tensor: str, + masks_tensor: Optional[str], + boxes_tensor: str, + labels_tensor: str, + pipeline: List, + mode: str = "train", + **loader_config, +): + poly2mask = False + if masks_tensor is not None: + if dataset.schema[masks_tensor].dtype.kind == TypeKind.Polygon: + poly2mask = True + + bbox_info = dict(dataset[boxes_tensor].metadata) + classes = dataset[labels_tensor].metadata["class_names"] + pipeline = build_pipeline(pipeline) + metrics_format = loader_config.get("metrics_format") + persistent_workers = loader_config.get("persistent_workers", False) + dist = loader_config["dist"] + seed = loader_config["seed"] + + transform_fn = partial( + transform, + images_tensor=images_tensor, + masks_tensor=masks_tensor, + boxes_tensor=boxes_tensor, + labels_tensor=labels_tensor, + pipeline=pipeline, + bbox_info=bbox_info, + poly2mask=poly2mask, + ) + + num_workers = loader_config.get("num_workers") + pin_memory = loader_config.get("pin_memory", False) + if num_workers is None: + num_workers = loader_config["workers_per_gpu"] + + shuffle = loader_config.get("shuffle", True) + tensors_dict = { + "images_tensor": images_tensor, + "boxes_tensor": boxes_tensor, + "labels_tensor": labels_tensor, + } + tensors = [images_tensor, labels_tensor, boxes_tensor] + if masks_tensor is not None: + tensors.append(masks_tensor) + tensors_dict["masks_tensor"] = masks_tensor + + batch_size = loader_config.get("batch_size") + drop_last = loader_config.get("drop_last", False) + if batch_size is None: + batch_size = loader_config["samples_per_gpu"] + + collate_fn = partial(collate, samples_per_gpu=batch_size) + + mmdet_ds = MMDetDataset( + dataset=dataset, + metrics_format=metrics_format, + pipeline=pipeline, + tensors_dict=tensors_dict, + tensors=tensors, + mode=mode, + bbox_info=bbox_info, + num_gpus=loader_config["num_gpus"], + batch_size=batch_size, + ) + + # get_dist_info() falls back to rank 0 / world size 1 when torch.distributed is not + # initialized; rank is also needed by worker_init_fn below when dist is False. + rank, world_size = get_dist_info() + if dist: + sl = get_indexes( + dataset, rank=rank, num_replicas=world_size, drop_last=drop_last + ) + dataset = dataset.query( + f"select * LIMIT {sl.stop - sl.start} OFFSET {sl.start}" + ) + + pytorch_ds = MMDetTorchDataset(dataset, transform=transform_fn) + + init_fn = ( + partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) + if seed is not None + else None + ) + + if digit_version(torch.__version__) >= digit_version("1.8.0"): + loader = DataLoader( + pytorch_ds, + batch_size=batch_size, + sampler=None, + num_workers=num_workers, + collate_fn=collate_fn, + pin_memory=pin_memory, + shuffle=shuffle, + worker_init_fn=init_fn, + drop_last=drop_last, + persistent_workers=persistent_workers, + ) + else: + loader = DataLoader( + pytorch_ds, + batch_size=batch_size, + sampler=None, + num_workers=num_workers, + collate_fn=collate_fn, + pin_memory=pin_memory, + shuffle=shuffle, + worker_init_fn=init_fn, + drop_last=drop_last, + ) + + loader.dataset.mmdet_dataset = mmdet_ds + loader.dataset.pipeline = loader.dataset.mmdet_dataset.pipeline + eval_fn = partial(mmdet_subiterable_dataset_eval, loader) + loader.dataset.evaluate =
eval_fn + loader.dataset.CLASSES = classes + return loader + + +def build_pipeline(steps): + return Compose( + [ + build_from_cfg(step, PIPELINES, None) + for step in steps + if step["type"] not in {"LoadImageFromFile", "LoadAnnotations"} + ] + ) + + +def train_detector( + model, + cfg: mmcv.ConfigDict, + ds_train=None, + ds_train_tensors=None, + ds_val: Optional[dp.Dataset] = None, + ds_val_tensors=None, + distributed: bool = False, + timestamp=None, + meta=None, + validate: bool = True, +): + """ + Creates the runner, then trains and evaluates the model: + Args: + model: model to train, should be built before passing + cfg: mmcv.ConfigDict object containing all necessary configuration. + In cfg we have several changes to support the deeplake integration: + _base_: still serves as a base model to inherit from + data: holds everything related to data processing; you will need to specify the following parameters: + train: everything related to training data, it has the following attributes: + pipeline: dictionary where all training augmentations and transformations should be specified, like in mmdet + deeplake_tensors: dictionary that maps mmdet keys to deeplake dataset tensors. Example: `{"img": "images", "gt_bboxes": "boxes", "gt_labels": "categories"}`. + If this dictionary is not specified, these tensors will be searched automatically using htypes like "image", "class_label", "bbox", "segment_mask" or "polygon". + keys that need to be mapped are: `img`, `gt_labels`, `gt_bboxes`, `gt_masks`. `img`, `gt_labels`, `gt_bboxes` are always required; if they are not specified, they + are always searched, while masks are optional; if you specify `gt_masks` in collect, then you need to either specify it in the config or it will be searched based on + `segment_mask` and `polygon` htypes. + deeplake_credentials: dictionary with deeplake credentials that allow you to access the specified data. It has the following argument: `token`. + `token` is the token that gives you read or write access to the datasets. It is available in your personal account on: https://www.activeloop.ai/. + val (Optional): everything related to validation data, it has the following attributes: + pipeline: dictionary where all training augmentations and transformations should be specified, like in mmdet + deeplake_tensors: dictionary that maps mmdet keys to deeplake dataset tensors. Example: {"img": "images", "gt_bboxes": "boxes", "gt_labels": "categories"}. + If this dictionary is not specified, these tensors will be searched automatically using htypes like "image", "class_label", "bbox", "segment_mask" or "polygon". + keys that need to be mapped are: `img`, `gt_labels`, `gt_bboxes`, `gt_masks`. `img`, `gt_labels`, `gt_bboxes` are always required; if they are not specified, they + are always searched, while masks are optional; if you specify `gt_masks` in collect, then you need to either specify it in the config or it will be searched based on + `segment_mask` and `polygon` htypes. + deeplake_credentials: deeplake credentials that allow you to access the specified data. It has the following argument: `token`. + `token` is the token that gives you read or write access to the datasets. It is available in your personal account on: https://www.activeloop.ai/.
+ test (Optional): everything related to testing data, it has the following attributes: + pipeline: dictionary where all training augmentations and transformations should be specified, like in mmdet + deeplake_tensors: dictionary that maps mmdet keys to deeplake dataset tensors. Example: {"img": "images", "gt_bboxes": "boxes", "gt_labels": "categories"}. + If this dictionary is not specified, these tensors will be searched automatically using htypes like "image", "class_label", "bbox", "segment_mask" or "polygon". + keys that need to be mapped are: `img`, `gt_labels`, `gt_bboxes`, `gt_masks`. `img`, `gt_labels`, `gt_bboxes` are always required; if they are not specified, they + are always searched, while masks are optional; if you specify `gt_masks` in collect, then you need to either specify it in the config or it will be searched based on + `segment_mask` and `polygon` htypes. + deeplake_credentials: deeplake credentials that allow you to access the specified data. It has the following argument: `token`. + `token` is the token that gives you read or write access to the datasets. It is available in your personal account on: https://www.activeloop.ai/. + samples_per_gpu: number of samples to be processed per gpu + workers_per_gpu: number of workers per gpu + optimizer: dictionary containing information about optimizer initialization + optimizer_config: some optimizer configuration that might be used during training like grad_clip etc. + runner: training type e.g. EpochBasedRunner, here you can specify the maximum number of epochs to be conducted. For instance: `runner = dict(type='EpochBasedRunner', max_epochs=273)` + ds_train: train dataset of type dp.Dataset. This can be a view of the dataset. + ds_train_tensors: dictionary that maps mmdet keys to deeplake dataset tensors. Example: {"img": "images", "gt_bboxes": "boxes", "gt_labels": "categories"}. + If this dictionary is not specified, these tensors will be searched automatically using htypes like "image", "class_label", "bbox", "segment_mask" or "polygon". + keys that need to be mapped are: `img`, `gt_labels`, `gt_bboxes`, `gt_masks`. `img`, `gt_labels`, `gt_bboxes` are always required; if they are not specified, they + are always searched, while masks are optional; if you specify `gt_masks` in collect, then you need to either specify it in the config or it will be searched based on + `segment_mask` and `polygon` htypes. + ds_val: validation dataset of type dp.Dataset. This can be a view of the dataset. + ds_val_tensors: dictionary that maps mmdet keys to deeplake dataset tensors. Example: {"img": "images", "gt_bboxes": "boxes", "gt_labels": "categories"}. + If this dictionary is not specified, these tensors will be searched automatically using htypes like "image", "class_label", "bbox", "segment_mask" or "polygon". + keys that need to be mapped are: `img`, `gt_labels`, `gt_bboxes`, `gt_masks`. `img`, `gt_labels`, `gt_bboxes` are always required; if they are not specified, they + are always searched, while masks are optional; if you specify `gt_masks` in collect, then you need to either specify it in the config or it will be searched based on + `segment_mask` and `polygon` htypes. + evaluation: dictionary that contains all information needed for evaluation apart from data processing, like how often evaluation should be done and what metrics we want to use. In the deeplake + integration you also need to specify what kind of output you want to be printed during evaluation.
For instance, `evaluation = dict(interval=1, metric=['bbox'], metrics_format="COCO")` + distributed: bool, whether ddp training should be started, by default `False` + timestamp: variable used in runner to make .log and .log.json filenames the same + meta: meta data used to build runner + validate: bool, whether validation should be conducted, by default `True` + """ + check_unsupported_functionalities(cfg) + + if not hasattr(cfg, "gpu_ids"): + cfg.gpu_ids = range(torch.cuda.device_count() if distributed else 1) + if distributed: + return torch.multiprocessing.spawn( + _train_detector, + args=( + model, + cfg, + ds_train, + ds_train_tensors, + ds_val, + ds_val_tensors, + distributed, + timestamp, + meta, + validate, + _get_free_port(), + ), + nprocs=len(cfg.gpu_ids), + ) + _train_detector( + 0, + model, + cfg, + ds_train, + ds_train_tensors, + ds_val, + ds_val_tensors, + distributed, + timestamp, + meta, + validate, + ) + + +def _train_detector( + local_rank, + model, + cfg: mmcv.ConfigDict, + ds_train=None, + ds_train_tensors=None, + ds_val: Optional[dp.Dataset] = None, + ds_val_tensors=None, + distributed: bool = False, + timestamp=None, + meta=None, + validate: bool = True, + port=None, +): + batch_size = cfg.data.get("samples_per_gpu", 256) + num_workers = cfg.data.get("workers_per_gpu", 1) + + if ds_train is None: + ds_train = load_ds_from_cfg(cfg.data.train) + ds_train_tensors = cfg.data.train.get("deeplake_tensors", {}) + else: + cfg_data = cfg.data.train.get("deeplake_path") + if cfg_data: + always_warn( + "A Deep Lake dataset was specified in the cfg as well as in the dataset input to train_detector. The dataset input to train_detector will be used in the workflow." + ) + + eval_cfg = cfg.get("evaluation", {}) + if ds_train_tensors: + train_images_tensor = ds_train_tensors["img"] + train_boxes_tensor = ds_train_tensors["gt_bboxes"] + train_labels_tensor = ds_train_tensors["gt_labels"] + train_masks_tensor = ds_train_tensors.get("gt_masks") + else: + train_images_tensor = find_image_tensor(ds_train, mm_class="img") + train_boxes_tensor = find_tensor_with_htype( + ds_train, type_kind=TypeKind.BoundingBox, mm_class="gt_bboxes" + ) + train_labels_tensor = find_tensor_with_htype( + ds_train, type_kind=TypeKind.ClassLabel, mm_class="train gt_labels" + ) + train_masks_tensor = None + + collection_keys = get_collect_keys(cfg) + if "gt_masks" in collection_keys: + train_masks_tensor = find_tensor_with_htype( + ds_train, type_kind=TypeKind.BinaryMask, mm_class="gt_masks" + ) or find_tensor_with_htype( + ds_train, type_kind=TypeKind.Polygon, mm_class="gt_masks" + ) + + # TODO verify required tensors are not None and raise Exception. + if hasattr(model, "CLASSES"): + warnings.warn( + "model already has a CLASSES attribute. dataset.info.class_names will not be used."
+ ) + elif "class_names" in dict(ds_train[train_labels_tensor].metadata): + model.CLASSES = ds_train[train_labels_tensor].metadata["class_names"] + + metrics_format = cfg.get("deeplake_metrics_format", "COCO") + + logger = get_root_logger(log_level=cfg.log_level) + + runner_type = "EpochBasedRunner" if "runner" not in cfg else cfg.runner["type"] + + train_dataloader_default_args = dict( + samples_per_gpu=batch_size, + workers_per_gpu=num_workers, + # `num_gpus` will be ignored if distributed + num_gpus=len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + runner_type=runner_type, + metrics_format=metrics_format, + ) + + train_loader_cfg = { + **train_dataloader_default_args, + **cfg.data.get("train_dataloader", {}), + **cfg.data.train.get("deeplake_dataloader", {}), + } + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get("find_unused_parameters", False) + # Sets the `find_unused_parameters` parameter in + # # torch.nn.parallel.DistributedDataParallel + # model = torch.nn.parallel.DistributedDataParallel(model.cuda(), + # device_ids=[local_rank], + # output_device=local_rank, + # broadcast_buffers=False, + # find_unused_parameters=find_unused_parameters) + force_cudnn_initialization(cfg.gpu_ids[local_rank]) + ddp_setup(local_rank, len(cfg.gpu_ids), port) + model = build_ddp( + model, + cfg.device, + device_ids=[cfg.gpu_ids[local_rank]], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters, + ) + else: + model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids) + + train_pipeline = get_pipeline(cfg, name="train", generic_name="train_pipeline") + + data_loader = build_dataloader( + ds_train, # TO DO: convert it to a for loop if we will support concatenating several datasets + train_images_tensor, + train_masks_tensor, + train_boxes_tensor, + train_labels_tensor, + pipeline=train_pipeline, + **train_loader_cfg, + ) + # build optimizer + auto_scale_lr(cfg, distributed, logger) + optimizer = build_optimizer(model, cfg.optimizer) + + cfg.custom_imports = dict( + imports=["deeplake.integrations.mm.mm_runners"], + allow_failed_imports=False, + ) + if cfg.runner.type == "IterBasedRunner": + cfg.runner.type = "DeeplakeIterBasedRunner" + elif cfg.runner.type == "EpochBasedRunner": + cfg.runner.type = "DeeplakeEpochBasedRunner" + + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta, + force_cleanup=True, + ), + ) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get("fp16", None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed + ) + elif distributed and "type" not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks( + cfg.lr_config, + optimizer_config, + cfg.checkpoint_config, + cfg.log_config, + cfg.get("momentum_config", None), + custom_hooks_config=cfg.get("custom_hooks", None), + ) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + val_dataloader_default_args = dict( + samples_per_gpu=batch_size, + workers_per_gpu=num_workers, + dist=distributed, + seed=cfg.seed, + shuffle=False, + mode="val", + metrics_format=metrics_format, + num_gpus=len(cfg.gpu_ids), + )
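+ # NOTE: the defaults are merged last below, so they take precedence over any overlapping keys in cfg.data.val's deeplake_dataloader (the reverse of the train loader merge above).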
+ + val_dataloader_args = { + **cfg.data.val.get("deeplake_dataloader", {}), + **val_dataloader_default_args, + } + + train_persistent_workers = train_loader_cfg.get("persistent_workers", False) + val_persistent_workers = val_dataloader_args.get("persistent_workers", False) + check_persistent_workers(train_persistent_workers, val_persistent_workers) + + if val_dataloader_args.get("shuffle", False): + always_warn("shuffle argument for validation dataset will be ignored.") + + if ds_val is None: + cfg_ds_val = cfg.data.get("val") + if cfg_ds_val is None or not any( + cfg_ds_val.get(key) is not None + for key in ["deeplake_path", "deeplake_query"] + ): + raise ValidationDatasetMissingError() + + ds_val = load_ds_from_cfg(cfg.data.val) + ds_val_tensors = cfg.data.val.get("deeplake_tensors", {}) + else: + cfg_data = cfg.data.val.get("deeplake_path") + if cfg_data is not None: + always_warn( + "A Deep Lake dataset was specified in the cfg as well as in the dataset input to train_detector. The dataset input to train_detector will be used in the workflow." + ) + + if ds_val is None: + raise ValidationDatasetMissingError() + + if val_dataloader_args["samples_per_gpu"] > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor(cfg.data.val.pipeline) + + if ds_val_tensors: + val_images_tensor = ds_val_tensors["img"] + val_boxes_tensor = ds_val_tensors["gt_bboxes"] + val_labels_tensor = ds_val_tensors["gt_labels"] + val_masks_tensor = ds_val_tensors.get("gt_masks") + else: + val_images_tensor = find_image_tensor(ds_val, mm_class="img") + val_boxes_tensor = find_tensor_with_htype( + ds_val, type_kind=TypeKind.BoundingBox, mm_class="gt_bboxes" + ) + val_labels_tensor = find_tensor_with_htype( + ds_val, type_kind=TypeKind.ClassLabel, mm_class="gt_labels" + ) + val_masks_tensor = None + + collection_keys = get_collect_keys(cfg) + if "gt_masks" in collection_keys: + val_masks_tensor = find_tensor_with_htype( + ds_val, type_kind=TypeKind.BinaryMask, mm_class="gt_masks" + ) or find_tensor_with_htype( + ds_val, type_kind=TypeKind.Polygon, mm_class="gt_masks" + ) + + # TODO make sure required tensors are not None. + val_pipeline = get_pipeline(cfg, name="val", generic_name="test_pipeline") + + val_dataloader = build_dataloader( + ds_val, + val_images_tensor, + val_masks_tensor, + val_boxes_tensor, + val_labels_tensor, + pipeline=val_pipeline, + **val_dataloader_args, + ) + + eval_cfg["by_epoch"] = cfg.runner["type"] != "DeeplakeIterBasedRunner" + eval_hook = EvalHook + if distributed: + eval_hook = DistEvalHook + # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the + # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. 
+ runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority="LOW") + + resume_from = None + if cfg.resume_from is None and cfg.get("auto_resume"): + resume_from = find_latest_checkpoint(cfg.work_dir) + if resume_from is not None: + cfg.resume_from = resume_from + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run([data_loader], cfg.workflow) diff --git a/python/deeplake/integrations/mmdet/mmdet_dataset_.py b/python/deeplake/integrations/mmdet/mmdet_dataset_.py new file mode 100644 index 0000000000..eaa4183976 --- /dev/null +++ b/python/deeplake/integrations/mmdet/mmdet_dataset_.py @@ -0,0 +1,823 @@ +from collections import OrderedDict +from typing import Callable, Optional, List, Dict, Sequence + +import os +import math +import types +import torch +import warnings +import tempfile +import numpy as np +import os.path as osp + +from PIL import Image, ImageDraw # type: ignore + +from terminaltables import AsciiTable # type: ignore + +try: + from mmdet.apis.train import auto_scale_lr # type: ignore +except Exception: + import mmdet # type: ignore + + version = mmdet.__version__ + raise Exception( + f"MMDet {version} version is not supported. The latest supported MMDet version with deeplake is 2.28.1." + ) + +from mmdet.core import eval_map, eval_recalls +from mmdet.core import BitmapMasks, PolygonMasks + +import mmcv # type: ignore +from mmcv.utils import print_log + +import deeplake as dp +from deeplake.types import TypeKind + +from deeplake.integrations.mm.upcast_array import upcast_array +from deeplake.integrations.mm.warnings import always_warn +from deeplake.integrations.mmdet import mmdet_utils_ + +from torch.utils.data import DataLoader + +# Monkey-patch the function +from deeplake.integrations.mm.exceptions import InvalidImageError +from deeplake.integrations.mmdet.test_ import single_gpu_test as custom_single_gpu_test +from deeplake.integrations.mmdet.test_ import multi_gpu_test as custom_multi_gpu_test + +from torch.utils.data import Dataset + + +def coco_pixel_2_pascal_pixel(boxes, shape): + """ + Converts bounding boxes from COCO pixel format (x, y, width, height) + to Pascal VOC pixel format (x_min, y_min, x_max, y_max). + + An empty input is returned as an empty (0, 4) array. + + @param boxes: numpy array of shape (N, 4), containing bounding boxes in COCO format. + @param shape: tuple, the shape of the image (height, width). + + @return: numpy array of shape (N, 4), bounding boxes in Pascal VOC format. + """ + pascal_boxes = np.empty((0, 4), dtype=boxes.dtype) + if boxes.size != 0: + pascal_boxes = np.stack( + ( + boxes[:, 0], + boxes[:, 1], + boxes[:, 0] + boxes[:, 2], + boxes[:, 1] + boxes[:, 3], + ), + axis=1, + ) + return pascal_boxes + + +def poly_2_mask(polygons, shape): + # TODO: this doesn't fill the array in place: out = np.zeros(shape + (len(polygons),), dtype=np.uint8) + """ + Converts a list of polygons into a binary mask. + + @param polygons: list of polygons, where each polygon is a list of (x, y) coordinates. + @param shape: tuple, the shape of the mask (height, width). + + @return: numpy array, binary mask of the same size as the image.
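+
+    Example (a hypothetical 4x4 mask with a single triangle polygon):
+
+    ```python
+    polygons = [[(0, 0), (3, 0), (0, 3)]]
+    mask = poly_2_mask(polygons, (4, 4))  # -> shape (4, 4, 1), dtype uint8
+    ```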
+ """ + out = np.zeros(shape + (len(polygons),), dtype=np.uint8) + for i, polygon in enumerate(polygons): + im = Image.fromarray(out[..., i]) + d = ImageDraw.Draw(im) + d.polygon(polygon, fill=1) + out[..., i] = np.asarray(im) + return out + + +def coco_frac_2_pascal_pixel(boxes, shape): + """ + Converts bounding boxes from fractional COCO format (relative to image size) + to Pascal VOC pixel format. + + @param boxes: numpy array of shape (N, 4), bounding boxes in fractional COCO format. + @param shape: tuple, the shape of the image (height, width). + + @return: numpy array of shape (N, 4), bounding boxes in Pascal VOC format. + """ + bbox = np.empty((0, 4), dtype=boxes.dtype) + if boxes.size != 0: + x = boxes[:, 0] * shape[1] + y = boxes[:, 1] * shape[0] + w = boxes[:, 2] * shape[1] + h = boxes[:, 3] * shape[0] + bbox = np.stack((x, y, w, h), axis=1) + return coco_pixel_2_pascal_pixel(bbox, shape) + + +def pascal_frac_2_pascal_pixel(boxes, shape): + """ + Converts bounding boxes from fractional Pascal VOC format (LTRB) + to pixel Pascal VOC format. + + @param boxes: numpy array of shape (N, 4), bounding boxes in fractional format. + @param shape: tuple, the shape of the image (height, width). + + @return: numpy array of shape (N, 4), bounding boxes in pixel format. + """ + bbox = np.empty((0, 4), dtype=boxes.dtype) + if boxes.size != 0: + x_top = boxes[:, 0] * shape[1] + y_top = boxes[:, 1] * shape[0] + x_bottom = boxes[:, 2] * shape[1] + y_bottom = boxes[:, 3] * shape[0] + bbox = np.stack((x_top, y_top, x_bottom, y_bottom), axis=1) + return bbox + + +def yolo_pixel_2_pascal_pixel(boxes, shape): + """ + Converts bounding boxes from YOLO pixel format (center_x, center_y, width, height) + to Pascal VOC pixel format (LTRB). + + @param boxes: numpy array of shape (N, 4), bounding boxes in YOLO format. + @param shape: tuple, the shape of the image (height, width). + + @return: numpy array of shape (N, 4), bounding boxes in Pascal VOC format. + """ + bbox = np.empty((0, 4), dtype=boxes.dtype) + if boxes.size != 0: + x_top = np.array(boxes[:, 0]) - np.floor(np.array(boxes[:, 2]) / 2) + y_top = np.array(boxes[:, 1]) - np.floor(np.array(boxes[:, 3]) / 2) + x_bottom = np.array(boxes[:, 0]) + np.floor(np.array(boxes[:, 2]) / 2) + y_bottom = np.array(boxes[:, 1]) + np.floor(np.array(boxes[:, 3]) / 2) + bbox = np.stack((x_top, y_top, x_bottom, y_bottom), axis=1) + return bbox + + +def yolo_frac_2_pascal_pixel(boxes, shape): + """ + Converts bounding boxes from YOLO fractional format to Pascal VOC pixel format. + + @param boxes: numpy array of shape (N, 4), bounding boxes in YOLO fractional format. + @param shape: tuple, the shape of the image (height, width). + + @return: numpy array of shape (N, 4), bounding boxes in Pascal VOC format. 
+ """ + bbox = np.empty((0, 4), dtype=boxes.dtype) + if boxes.size != 0: + x_center = boxes[:, 0] * shape[1] + y_center = boxes[:, 1] * shape[0] + width = boxes[:, 2] * shape[1] + height = boxes[:, 3] * shape[0] + bbox = np.stack((x_center, y_center, width, height), axis=1) + return yolo_pixel_2_pascal_pixel(bbox, shape) + + +def get_bbox_format(bbox, bbox_info): + bbox_info = bbox_info.get("coords") + if not bbox_info: + bbox_info = {} + mode = bbox_info.get("mode", "LTWH") + type = bbox_info.get("type", "pixel") + + if len(bbox_info) == 0 and np.mean(bbox) < 1: + mode = "CCWH" + type = "fractional" + return (mode, type) + + +BBOX_FORMAT_TO_PASCAL_CONVERTER = { + ("LTWH", "pixel"): coco_pixel_2_pascal_pixel, + ("LTWH", "fractional"): coco_frac_2_pascal_pixel, + ("LTRB", "pixel"): lambda x, y: x, + ("LTRB", "fractional"): pascal_frac_2_pascal_pixel, + ("CCWH", "pixel"): yolo_pixel_2_pascal_pixel, + ("CCWH", "fractional"): yolo_frac_2_pascal_pixel, +} + + +def convert_to_pascal_format(bbox, bbox_info, shape): + bbox_format = get_bbox_format(bbox, bbox_info) + converter = BBOX_FORMAT_TO_PASCAL_CONVERTER[bbox_format] + return converter(bbox, shape) + + +def pascal_pixel_2_coco_pixel(boxes, images): + """ + Converts bounding boxes from Pascal VOC pixel format (LTRB) + to COCO pixel format (x, y, width, height). + + @param boxes: numpy array of images (N, 4), bounding boxes in Pascal VOC format. + @param images: tuple, the images of the image (height, width). + + @return: numpy array of images (N, 4), bounding boxes in COCO pixel format. + """ + pascal_boxes = [] + for box in boxes: + if box.size != 0: + pascal_boxes.append( + np.stack( + ( + box[:, 0], + box[:, 1], + box[:, 2] - box[:, 0], + box[:, 3] - box[:, 1], + ), + axis=1, + ) + ) + else: + pascal_boxes.append(box) + return pascal_boxes + + +def pascal_frac_2_coco_pixel(boxes, images): + pascal_pixel_boxes = [] + for i, box in enumerate(boxes): + if box.size != 0: + shape = images[i].shape + x_top = box[:, 0] * shape[1] + y_top = box[:, 1] * shape[0] + x_bottom = box[:, 2] * shape[1] + y_bottom = box[:, 3] * shape[0] + bbox = np.stack((x_top, y_top, x_bottom, y_bottom), axis=1) + pascal_pixel_boxes.append(bbox) + return pascal_pixel_2_coco_pixel(pascal_pixel_boxes, images) + + +def yolo_pixel_2_coco_pixel(boxes, images): + yolo_boxes = [] + for box in boxes: + if box.size != 0: + x_top = np.array(box[:, 0]) - np.floor(np.array(box[:, 2]) / 2) + y_top = np.array(box[:, 1]) - np.floor(np.array(box[:, 3]) / 2) + w = box[:, 2] + h = box[:, 3] + bbox = np.stack([x_top, y_top, w, h], axis=1) + yolo_boxes.append(bbox) + return yolo_boxes + + +def yolo_frac_2_coco_pixel(boxes, images): + yolo_boxes = [] + for i, box in enumerate(boxes): + shape = images[i].shape + x_center = box[:, 0] * shape[1] + y_center = box[:, 1] * shape[0] + width = box[:, 2] * shape[1] + height = box[:, 3] * shape[0] + bbox = np.stack((x_center, y_center, width, height), axis=1) + yolo_boxes.append(bbox) + return yolo_pixel_2_coco_pixel(yolo_boxes, images) + + +def coco_frac_2_coco_pixel(boxes, images): + coco_pixel_boxes = [] + for i, box in enumerate(boxes): + shape = images[i].shape + x = box[:, 0] * shape[1] + y = box[:, 1] * shape[0] + w = box[:, 2] * shape[1] + h = box[:, 3] * shape[0] + bbox = np.stack((x, y, w, h), axis=1) + coco_pixel_boxes.append(bbox) + return np.array(coco_pixel_boxes) + + +BBOX_FORMAT_TO_COCO_CONVERTER = { + ("LTWH", "pixel"): lambda x, y: x, + ("LTWH", "fractional"): coco_frac_2_coco_pixel, + ("LTRB", "pixel"): pascal_pixel_2_coco_pixel, + 
("LTRB", "fractional"): pascal_frac_2_coco_pixel, + ("CCWH", "pixel"): yolo_pixel_2_coco_pixel, + ("CCWH", "fractional"): yolo_frac_2_coco_pixel, +} + + +def convert_to_coco_format(bbox, bbox_format, images): + converter = BBOX_FORMAT_TO_COCO_CONVERTER[bbox_format] + return converter(bbox, images) + + +def first_non_empty(bboxes): + for box in bboxes: + if len(box): + return box + raise ValueError("Empty bboxes") + + +def transform( + sample_in, + images_tensor: str, + masks_tensor: str, + boxes_tensor: str, + labels_tensor: str, + pipeline: Callable, + bbox_info: str, + poly2mask: bool, +): + img = upcast_array(sample_in[images_tensor]) + if not isinstance(img, np.ndarray): + img = np.array(img) + + bboxes = upcast_array(sample_in[boxes_tensor]) + # TODO bbox format should be recognized outside the transform, not per sample basis. + bboxes = convert_to_pascal_format(bboxes, bbox_info, img.shape) + if bboxes.shape == (0, 0): # TO DO: remove after bug will be fixed + bboxes = np.empty((0, 4), dtype=sample_in[boxes_tensor].dtype) + + labels = upcast_array(sample_in[labels_tensor]) + + if img.ndim == 2: + img = np.expand_dims(img, -1) + + img = img[..., ::-1] # rgb_to_bgr should be optional + if img.shape[2] == 1: + img = np.repeat(img, 3, axis=2) + shape = img.shape + + pipeline_dict = { + "img": np.ascontiguousarray(img, dtype=np.float32), + "img_fields": ["img"], + "filename": None, + "ori_filename": None, + "img_shape": shape, + "ori_shape": shape, + "gt_bboxes": bboxes, + "gt_labels": labels, + "bbox_fields": ["gt_bboxes"], + } + + if masks_tensor: + masks = upcast_array(sample_in[masks_tensor]) + if poly2mask: + masks = mmdet_utils_.convert_poly_to_coco_format(masks) + masks = PolygonMasks( + [process_polygons(polygons) for polygons in masks], shape[0], shape[1] + ) + else: + masks = BitmapMasks(masks.astype(np.uint8).transpose(2, 0, 1), *shape[:2]) + + pipeline_dict["gt_masks"] = masks + pipeline_dict["mask_fields"] = ["gt_masks"] + return pipeline(pipeline_dict) + + +def process_polygons(polygons): + """Convert polygons to list of ndarray and filter invalid polygons. + + Args: + polygons (list[list]): Polygons of one instance. + + Returns: + list[numpy.ndarray]: Processed polygons. 
+    """
+
+    polygons = [np.array(p) for p in polygons]
+    valid_polygons = []
+    for polygon in polygons:
+        # a valid polygon has an even number of coordinates and at least 3 points
+        if len(polygon) % 2 == 0 and len(polygon) >= 6:
+            valid_polygons.append(polygon)
+    return valid_polygons
+
+
+class MMDetTorchDataset(Dataset):
+    def __init__(
+        self,
+        dataset,
+        tensors: Optional[Sequence[str]] = None,
+        transform: Optional[Callable] = None,
+    ) -> None:
+        super().__init__()
+        self.dataset = dataset
+        self.transform = transform
+        self.column_names = [col.name for col in self.dataset.schema.columns]
+        self.last_successful_index = -1
+
+    def __getstate__(self):
+        return {
+            "dataset": self.dataset,
+            "transform": self.transform,
+            "column_names": self.column_names,
+            "last_successful_index": self.last_successful_index,
+        }
+
+    def __setstate__(self, state):
+        """Restore state from pickled state."""
+        if hasattr(super(), "__setstate__"):
+            super().__setstate__(state)
+
+        self.dataset = state["dataset"]
+        self.transform = state["transform"]
+        self.column_names = state["column_names"]
+        self.last_successful_index = state["last_successful_index"]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, idx):
+        while True:
+            try:
+                sample = self.dataset[idx]
+                if self.transform:
+                    out = self.transform(sample)
+                else:
+                    out = {}
+                    for col in self.column_names:
+                        out[col] = sample[col]
+                self.last_successful_index = idx
+                return out
+            except InvalidImageError as e:
+                print(f"Error processing data at index {idx}: {e}")
+                # Fall back to the last sample that loaded cleanly; if none has
+                # loaded yet, skip forward instead of retrying the bad index.
+                if self.last_successful_index == -1:
+                    idx = idx + 1
+                else:
+                    idx = self.last_successful_index
+                continue
+
+
+class MMDetDataset(MMDetTorchDataset):
+    def __init__(
+        self,
+        *args,
+        tensors_dict=None,
+        mode="train",
+        metrics_format="COCO",
+        bbox_info=None,
+        pipeline=None,
+        num_gpus=1,
+        batch_size=1,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.mode = mode
+        self.pipeline = pipeline
+        self.num_gpus = num_gpus
+        self.batch_size = batch_size
+        self.tensors_dict = tensors_dict
+        self.bbox_info = bbox_info
+        self.metrics_format = metrics_format
+        if self.mode in ("val", "test"):
+            self.images = self._get_images(self.tensors_dict["images_tensor"])
+            masks = self._get_masks(self.tensors_dict.get("masks_tensor", None))
+            masks_type_kind = (
+                self.dataset.schema[masks.name].dtype.kind
+                if masks is not None and masks != []
+                else None
+            )
+            self.masks_type_kind = masks_type_kind
+            self.masks = masks[:]
+            self.bboxes = self._get_bboxes(self.tensors_dict["boxes_tensor"])
+            bbox_format = get_bbox_format(first_non_empty(self.bboxes), bbox_info)
+            self.labels = self._get_labels(self.tensors_dict["labels_tensor"])
+            self.iscrowds = self._get_iscrowds(self.tensors_dict.get("iscrowds"))
+            self.CLASSES = self.get_classes(self.tensors_dict["labels_tensor"])
+            coco_style_bbox = convert_to_coco_format(
+                self.bboxes, bbox_format, self.images
+            )
+
+            if self.metrics_format == "COCO":
+                self.evaluator = mmdet_utils_.COCODatasetEvaluater(
+                    pipeline,
+                    classes=self.CLASSES,
+                    deeplake_dataset=self.dataset,
+                    imgs=self.images,
+                    masks=self.masks,
+                    masks_type_kind=self.masks_type_kind,
+                    bboxes=coco_style_bbox,
+                    labels=self.labels,
+                    iscrowds=self.iscrowds,
+                    bbox_format=bbox_format,
+                    num_gpus=num_gpus,
+                )
+            else:
+                self.evaluator = None
+
+    def __getstate__(self):
+        """Prepare state for pickling."""
+        state = super().__getstate__() if hasattr(super(), "__getstate__") else {}
+
+        state.update(
+            {
+                "mode": self.mode,
+                "pipeline": self.pipeline,
+                "num_gpus": self.num_gpus,
+                "batch_size": self.batch_size,
+                "tensors_dict": self.tensors_dict,
"bbox_info": self.bbox_info, + } + ) + return state + + def __setstate__(self, state): + """Restore state from pickled state.""" + if hasattr(super(), "__setstate__"): + super().__setstate__(state) + + self.mode = state["mode"] + self.pipeline = state["pipeline"] + self.num_gpus = state["num_gpus"] + self.batch_size = state["batch_size"] + self.tensors_dict = state["tensors_dict"] + self.bbox_info = state["bbox_info"] + + if self.mode in ("val", "test"): + self.images = self._get_images(self.tensors_dict["images_tensor"]) + masks = self._get_masks(self.tensors_dict.get("masks_tensor", None)) + masks_type_kind = ( + self.dataset.schema[masks.name].dtype.kind + if masks is not None and masks != [] + else None + ) + self.masks_type_kind = masks_type_kind + self.masks = masks[:] + self.bboxes = self._get_bboxes(self.tensors_dict["boxes_tensor"]) + bbox_format = get_bbox_format(first_non_empty(self.bboxes), bbox_info) + self.labels = self._get_labels(self.tensors_dict["labels_tensor"]) + self.iscrowds = self._get_iscrowds(self.tensors_dict.get("iscrowds")) + self.CLASSES = self.get_classes(self.tensors_dict["labels_tensor"]) + self.metrics_format = metrics_format + coco_style_bbox = convert_to_coco_format( + self.bboxes, bbox_format, self.images + ) + + if self.metrics_format == "COCO": + self.evaluator = mmdet_utils_.COCODatasetEvaluater( + pipeline, + classes=self.CLASSES, + deeplake_dataset=self.dataset, + imgs=self.images, + masks=self.masks, + masks_type_kind=self.masks_type_kind, + bboxes=coco_style_bbox, + labels=self.labels, + iscrowds=self.iscrowds, + bbox_format=bbox_format, + num_gpus=num_gpus, + ) + else: + self.evaluator = None + + def __len__(self): + if self.mode == "val": + per_gpu_length = math.floor( + len(self.dataset) / (self.batch_size * self.num_gpus) + ) + total_length = per_gpu_length * self.num_gpus + return total_length + return super().__len__() + + def _get_images(self, images_tensor): + image_tensor = self.dataset[images_tensor] + return image_tensor + + def _get_masks(self, masks_tensor): + if masks_tensor is None: + return [] + return self.dataset[masks_tensor] + + def _get_iscrowds(self, iscrowds_tensor): + if iscrowds_tensor is not None: + return iscrowds_tensor + + if "iscrowds" in [col.name for col in self.dataset.schema.columns]: + always_warn( + "Iscrowds was not specified, searching for iscrowds tensor in the dataset." + ) + return self.dataset["iscrowds"][:] + always_warn("iscrowds tensor was not found, setting its value to 0.") + return iscrowds_tensor + + def _get_bboxes(self, boxes_tensor): + return self.dataset[boxes_tensor][:] + + def _get_labels(self, labels_tensor): + return self.dataset[labels_tensor][:] + + def _get_class_names(self, labels_tensor): + return self.dataset[labels_tensor].metadata["class_names"] + + def get_ann_info(self, idx): + """Get annotation by index. + + Args: + idx (int): Index of data. + + Raises: + ValueError: when ``self.metrics`` is not valid. + + Returns: + dict: Annotation info of specified index. + """ + bboxes = convert_to_pascal_format( + self.bboxes[idx], self.bbox_info, self.images[idx].shape + ) + return {"bboxes": bboxes, "labels": self.labels[idx]} + + def get_cat_ids(self, idx): + """Get category ids by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. 
+
+    def get_cat_ids(self, idx):
+        """Get category ids by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            list[int]: All categories in the image of specified index.
+        """
+
+        cat_ids = self.labels[idx].astype(int).tolist()
+
+        return cat_ids
+
+    def _filter_imgs(self, min_size=32):
+        """Filter out images that are too small."""
+        if self.filter_empty_gt:
+            warnings.warn("CustomDataset does not support filtering empty gt images.")
+        valid_inds = []
+        for i, img_info in enumerate(self.data_infos):
+            if min(img_info["width"], img_info["height"]) >= min_size:
+                valid_inds.append(i)
+        return valid_inds
+
+    def get_classes(self, classes):
+        """Get class names of current dataset.
+
+        Args:
+            classes (str): Represents the name of the classes tensor. Overrides the CLASSES defined by the dataset.
+
+        Returns:
+            list[str]: Names of categories of the dataset.
+        """
+        return self.dataset[classes].metadata["class_names"]
+
+    def evaluate(
+        self,
+        results,
+        metric="mAP",
+        logger=None,
+        proposal_nums=(100, 300, 1000),
+        iou_thr=0.5,
+        scale_ranges=None,
+        **kwargs,
+    ):
+        """Evaluate the dataset.
+
+        Args:
+            results (list): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated.
+            logger (logging.Logger | None | str): Logger used for printing
+                related information during evaluation. Default: None.
+            proposal_nums (Sequence[int]): Proposal number used for evaluating
+                recalls, such as recall@100, recall@1000.
+                Default: (100, 300, 1000).
+            iou_thr (float | list[float]): IoU threshold. Default: 0.5.
+            scale_ranges (list[tuple] | None): Scale ranges for evaluating mAP.
+                Default: None.
+            **kwargs (dict): Keyword arguments passed through to the underlying
+                evaluator.
+
+        Raises:
+            KeyError: if a specified metric format is not supported
+
+        Returns:
+            OrderedDict: Evaluation metrics dictionary
+        """
+        if self.num_gpus > 1:
+            results_ordered = []
+            for i in range(self.num_gpus):
+                results_ordered += results[i :: self.num_gpus]
+            results = results_ordered
+
+        if self.evaluator is None:
+            if not isinstance(metric, str):
+                assert len(metric) == 1
+                metric = metric[0]
+            allowed_metrics = ["mAP", "recall"]
+            if metric not in allowed_metrics:
+                raise KeyError(f"metric {metric} is not supported")
+            annotations = [
+                self.get_ann_info(i) for i in range(len(self))
+            ]  # directly evaluate from hub
+            eval_results = OrderedDict()
+            iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr
+            if metric == "mAP":
+                assert isinstance(iou_thrs, list)
+                mean_aps = []
+                for iou_thr in iou_thrs:
+                    print_log(f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}')
+                    mean_ap, _ = eval_map(
+                        results,
+                        annotations,
+                        scale_ranges=scale_ranges,
+                        iou_thr=iou_thr,
+                        dataset=self.CLASSES,
+                        logger=logger,
+                    )
+                    mean_aps.append(mean_ap)
+                    eval_results[f"AP{int(iou_thr * 100):02d}"] = round(mean_ap, 3)
+                eval_results["mAP"] = sum(mean_aps) / len(mean_aps)
+            elif metric == "recall":
+                gt_bboxes = [ann["bboxes"] for ann in annotations]  # evaluate from hub
+                recalls = eval_recalls(
+                    gt_bboxes, results, proposal_nums, iou_thr, logger=logger
+                )
+                for i, num in enumerate(proposal_nums):
+                    for j, iou in enumerate(iou_thrs):
+                        eval_results[f"recall@{num}@{iou}"] = recalls[i, j]
+                if recalls.shape[1] > 1:
+                    ar = recalls.mean(axis=1)
+                    for i, num in enumerate(proposal_nums):
+                        eval_results[f"AR@{num}"] = ar[i]
+            return eval_results
+
+        return self.evaluator.evaluate(
+            results,
+            metric=metric,
+            logger=logger,
+            proposal_nums=proposal_nums,
+            **kwargs,
+        )
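For context on `evaluate`'s inputs: in the fallback path above, `results` follows the standard MMDetection convention of one entry per image, where each entry is a per-class list of `(k, 5)` arrays holding `x1, y1, x2, y2, score`. A hedged sketch; the `ds_val_mmdet` instance and the detections are hypothetical:

```python
import numpy as np

# Hypothetical detections for a 2-image, 2-class validation set.
results = [
    [  # image 0
        np.array([[40.0, 60.0, 60.0, 140.0, 0.9]]),  # class 0: one box + score
        np.zeros((0, 5)),                            # class 1: no detections
    ],
    [  # image 1
        np.zeros((0, 5)),
        np.array([[10.0, 10.0, 50.0, 50.0, 0.75]]),
    ],
]

# ds_val_mmdet: an MMDetDataset constructed in "val" mode.
metrics = ds_val_mmdet.evaluate(results, metric="mAP")
```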
+
+    @staticmethod
+    def _coco_2_pascal(boxes):
+        # Convert LTWH boxes to Pascal VOC LTRB format (no clipping is performed here)
+        return np.stack(
+            (
+                boxes[:, 0],
+                boxes[:, 1],
+                boxes[:, 0] + boxes[:, 2],
+                boxes[:, 1] + boxes[:, 3],
+            ),
+            axis=1,
+        )
+
+    def __repr__(self):
+        """Print the number of images and the per-category instance counts."""
+        dataset_type = "Test"
+        # if self.test_mode else "Train"
+        result = (
+            f"\n{self.__class__.__name__} {dataset_type} dataset "
+            f"with number of images {len(self)}, "
+            f"and instance counts: \n"
+        )
+        if self.CLASSES is None:
+            result += "Category names are not provided. \n"
+            return result
+        instance_count = np.zeros(len(self.CLASSES) + 1).astype(int)
+        # count the instance number in each image
+        for idx in range(len(self)):
+            label = self.get_ann_info(idx)["labels"]  # change this
+            unique, counts = np.unique(label, return_counts=True)
+            if len(unique) > 0:
+                # add the occurrence number to each class
+                instance_count[unique] += counts
+            else:
+                # background is the last index
+                instance_count[-1] += 1
+        # create a table with category count
+        table_data = [["category", "count"] * 5]
+        row_data = []
+        for cls, count in enumerate(instance_count):
+            if cls < len(self.CLASSES):
+                row_data += [f"{cls} [{self.CLASSES[cls]}]", f"{count}"]
+            else:
+                # add the background number
+                row_data += ["-1 background", f"{count}"]
+            if len(row_data) == 10:
+                table_data.append(row_data)
+                row_data = []
+        if len(row_data) >= 2:
+            if row_data[-1] == "0":
+                row_data = row_data[:-2]
+            if len(row_data) >= 2:
+                table_data.append([])
+                table_data.append(row_data)
+
+        table = AsciiTable(table_data)
+        result += table.table
+        return result
+
+    def format_results(self, results, jsonfile_prefix=None, **kwargs):
+        """Format the results to json (standard format for COCO evaluation).
+
+        Args:
+            results (list[tuple | numpy.ndarray]): Testing results of the
+                dataset.
+            jsonfile_prefix (str | None): The prefix of json files. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If not specified, a temp file will be created. Default: None.
+            kwargs (dict): Additional keyword arguments to be passed.
+
+        Returns:
+            tuple: (result_files, tmp_dir), result_files is a dict containing
+                the json filepaths, tmp_dir is the temporal directory created
+                for saving json files when jsonfile_prefix is not specified.
+        """
+        assert isinstance(results, list), "results must be a list"
+        assert len(results) == len(
+            self
+        ), "The length of results is not equal to the dataset len: {} != {}".format(
+            len(results), len(self)
+        )
+
+        if jsonfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            jsonfile_prefix = osp.join(tmp_dir.name, "results")
+        else:
+            tmp_dir = None
+        result_files = self.results2json(results, jsonfile_prefix)
+        return result_files, tmp_dir
diff --git a/python/deeplake/integrations/mmdet/mmdet_utils_.py b/python/deeplake/integrations/mmdet/mmdet_utils_.py
new file mode 100644
index 0000000000..741a1128a8
--- /dev/null
+++ b/python/deeplake/integrations/mmdet/mmdet_utils_.py
@@ -0,0 +1,498 @@
+import time
+import warnings
+import pycocotools  # type: ignore
+import numpy as np
+import copy
+import itertools
+import pycocotools.mask as maskUtils  # type: ignore
+from pycocotools import coco as pycocotools_coco  # type: ignore
+from pycocotools import mask as _mask
+from collections import defaultdict
+import sys
+from typing import Union, Dict, List
+
+PYTHON_VERSION = sys.version_info[0]
+if PYTHON_VERSION == 2:
+    from urllib import urlretrieve  # type: ignore
+elif PYTHON_VERSION == 3:
+    from urllib.request import urlretrieve
+from mmdet.datasets import coco as mmdet_coco  # type: ignore
+from mmdet.datasets import pipelines
+from deeplake.integrations.mm.warnings import always_warn
+from deeplake.types import TypeKind
+import json
+import mmcv  # type: ignore
+import math
+from tqdm import tqdm
+
+
+def _isArrayLike(obj):
+    return hasattr(obj, "__iter__") and hasattr(obj, "__len__")
+
+
+class _COCO(pycocotools_coco.COCO):
+    def __init__(
+        self,
+        deeplake_dataset=None,
+        imgs=None,
+        masks=None,
+        masks_type_kind=None,
+        bboxes=None,
+        labels=None,
+        iscrowds=None,
+        class_names=None,
+        bbox_format=("LTRB", "pixel"),
+    ):
+        """
+        Constructor of the COCO helper class, adapted to read and index
+        annotations directly from a Deep Lake dataset instead of a COCO
+        annotation file.
+        :param deeplake_dataset: Deep Lake dataset holding the annotations
+        :param imgs: image tensor
+        :param masks: mask tensor (binary masks or polygons), optional
+        :param masks_type_kind: type kind of the masks tensor
+        :param bboxes: bounding-box tensor
+        :param labels: label tensor
+        :param iscrowds: optional iscrowd tensor
+        :param class_names: list of category names
+        :param bbox_format: (mode, type) tuple describing the bbox format
+        :return:
+        """
+        self.masks = masks
+        self.masks_type_kind = masks_type_kind
+        self.bboxes = bboxes
+        self.labels = labels
+        self.imgs_orig = imgs
+        self.iscrowds = iscrowds
+        self.class_names = class_names
+        self.bbox_format = bbox_format
+
+        # load dataset
+        self.anns, self.cats, self.imgs = dict(), dict(), dict()
+        self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
+        print("loading annotations into memory...")
+        self.dataset = deeplake_dataset
+        if self.dataset is not None:
+            self.createDeeplakeIndex()
+
+    def createDeeplakeIndex(self):
+        # create index
+        print("creating index...")
+        anns, cats, imgs = {}, {}, {}
+        imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
+        absolute_id = 0
+        all_categories = self.labels
+        all_bboxes = self.bboxes
+        all_masks = self.masks
+        all_imgs = self.imgs_orig
+        all_iscrowds = self.iscrowds
+
+        for row_index, row in tqdm(
+            enumerate(self.dataset),
+            desc="loading annotations",
+            total=len(self.dataset),
+        ):
+            if all_imgs[row_index].size == 0:
+                always_warn(
+                    "found empty image, skipping it. Please verify that your dataset is not corrupted."
+                )
+                continue
+            categories = all_categories[row_index]  # make referencing custom
+            bboxes = all_bboxes[row_index]
+            if all_masks != [] and all_masks is not None:
+                masks = all_masks[row_index]
+            else:
+                masks = None
+            if all_iscrowds is not None:
+                is_crowds = all_iscrowds[row_index]
+            else:
+                is_crowds = np.zeros_like(categories)
+            img = {
+                "id": row_index,
+                "height": all_imgs[row_index].shape[0],
+                "width": all_imgs[row_index].shape[1],
+            }
+            imgs[row_index] = img
+            for bbox_index, bbox in enumerate(bboxes):
+                if self.masks is not None and self.masks != []:
+                    if self.masks_type_kind == TypeKind.BinaryMask:
+                        if masks.size == 0:
+                            mask = _mask.encode(np.asfortranarray(masks[:]))
+                        else:
+                            mask = _mask.encode(
+                                np.asfortranarray(masks[..., bbox_index])
+                            )
+
+                    elif self.masks_type_kind == TypeKind.Polygon:
+                        mask = convert_poly_to_coco_format(masks[:][bbox_index])
+                    else:
+                        raise Exception(
+                            f"type_kind={self.masks_type_kind} is not supported yet."
+                        )
+                ann = {
+                    "image_id": row_index,
+                    "id": absolute_id,
+                    "category_id": categories[bbox_index],
+                    "bbox": bbox,
+                    "area": bbox[2] * bbox[3],
+                    "segmentation": (
+                        mask if masks is not None else None
+                    ),  # optimize here
+                    "iscrowd": int(is_crowds[bbox_index]),
+                }
+
+                imgToAnns[row_index].append(ann)
+                anns[absolute_id] = ann
+                absolute_id += 1
+
+        category_names = self.class_names  # TODO: add supercategory names
+        category_names = [
+            {"id": cat_id, "name": name} for cat_id, name in enumerate(category_names)
+        ]
+
+        for idx, category_name in enumerate(category_names):
+            cats[idx] = category_name
+
+        for ann in anns.values():
+            catToImgs[ann["category_id"]].append(ann["image_id"])
+
+        # create class members
+        self.anns = anns
+        self.imgToAnns = imgToAnns
+        self.catToImgs = catToImgs
+        self.imgs = imgs
+        self.cats = cats
+        print("create index done!")
+
+    def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
+        """
+        Get ann ids that satisfy given filter conditions. default skips that filter
+        :param imgIds  (int array)     : get anns for given imgs
+               catIds  (int array)     : get anns for given cats
+               areaRng (float array)   : get anns for given area range (e.g. [0 inf])
+               iscrowd (boolean)       : get anns for given crowd label (False or True)
+        :return: ids (int array)       : integer array of ann ids
+        """
+        imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
+        catIds = catIds if _isArrayLike(catIds) else [catIds]
+
+        if len(imgIds) == len(catIds) == len(areaRng) == 0:
+            anns = list(self.anns.values())
+        else:
+            if not len(imgIds) == 0:
+                lists = [
+                    self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns
+                ]
+                anns = list(itertools.chain.from_iterable(lists))
+            else:
+                anns = list(self.anns.values())
+            anns = (
+                anns
+                if len(catIds) == 0
+                else [ann for ann in anns if ann["category_id"] in catIds]
+            )
+            anns = (
+                anns
+                if len(areaRng) == 0
+                else [
+                    ann
+                    for ann in anns
+                    if ann["area"] > areaRng[0] and ann["area"] < areaRng[1]
+                ]
+            )
+        if iscrowd is not None:
+            # `anns` is a plain list of annotation dicts at this point
+            ids = [ann["id"] for ann in anns if ann["iscrowd"] == iscrowd]
+        else:
+            ids = [ann["id"] for ann in anns]
+        return ids
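Since `_COCO` keeps the stock pycocotools query surface, the usual id-filtering calls work unchanged against the Deep Lake-backed index. A small sketch; the `coco` object and the category name are placeholders:

```python
# Hypothetical: `coco` is a _COCO/DeeplakeCOCO built from a Deep Lake dataset.
ann_ids = coco.getAnnIds(imgIds=[0])               # all annotations on image 0
crowd_ids = coco.getAnnIds(imgIds=[0], iscrowd=1)  # only crowd regions
cat_ids = coco.getCatIds(catNms=["car"])           # category ids by name
anns = coco.loadAnns(ann_ids)                      # resolve ids to annotation dicts
```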
+
+    def getCatIds(self, catNms: List = [], supNms: List = [], catIds: List = []):
+        """Filtering parameters.
+
+        Args:
+            catNms (List): get cats for given cat names
+            supNms (List): get classes for given supercategory names
+            catIds (List): get cats for given cat ids
+
+        Returns:
+            ids (List[int]): integer array of cat ids
+        """
+        catNms = catNms if _isArrayLike(catNms) else [catNms]
+        supNms = supNms if _isArrayLike(supNms) else [supNms]
+        catIds = catIds if _isArrayLike(catIds) else [catIds]
+
+        if len(catNms) == len(supNms) == len(catIds) == 0:
+            cats = list(self.cats.values())
+        else:
+            cats = list(self.cats.values())
+            cats = (
+                cats
+                if len(catNms) == 0
+                else [cat for cat in cats if cat["name"] in catNms]
+            )
+            cats = (
+                cats
+                if len(supNms) == 0
+                else [cat for cat in cats if cat["supercategory"] in supNms]
+            )
+            cats = (
+                cats
+                if len(catIds) == 0
+                else [cat for cat in cats if cat["id"] in catIds]
+            )
+        ids = [cat["id"] for cat in cats]
+        return ids
+
+    def loadRes(self, resFile):
+        """
+        Load result file and return a result api object.
+        :param   resFile (str)     : file name of result file
+        :return: res (obj)         : result api object
+        """
+        res = _COCO()
+        res.dataset = {}
+        res.dataset["images"] = [img for img in list(self.imgs.values())]
+
+        print("Loading and preparing results...")
+        tic = time.time()
+        if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):
+            with open(resFile) as f:
+                anns = json.load(f)
+        elif type(resFile) == np.ndarray:
+            anns = self.loadNumpyAnnotations(resFile)
+        else:
+            anns = resFile
+        assert type(anns) == list, "results is not an array of objects"
+        annsImgIds = [ann["image_id"] for ann in anns]
+        assert set(annsImgIds) == (
+            set(annsImgIds) & set(self.getImgIds())
+        ), "Results do not correspond to current coco set"
+        if "caption" in anns[0]:
+            imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
+                [ann["image_id"] for ann in anns]
+            )
+            res.dataset["images"] = [
+                img for img in res.dataset["images"] if img["id"] in imgIds
+            ]
+            for id, ann in enumerate(anns):
+                ann["id"] = id + 1
+        elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
+            res.dataset["categories"] = copy.deepcopy(list(self.cats.values()))
+            for id, ann in enumerate(anns):
+                bb = ann["bbox"]
+                x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
+                if not "segmentation" in ann:
+                    ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
+                ann["area"] = bb[2] * bb[3]
+                ann["id"] = id + 1
+                ann["iscrowd"] = 0
+        elif "segmentation" in anns[0]:
+            res.dataset["categories"] = copy.deepcopy(list(self.cats.values()))
+            for id, ann in enumerate(anns):
+                # now only support compressed RLE format as segmentation results
+                ann["area"] = maskUtils.area(ann["segmentation"])
+                if not "bbox" in ann:
+                    ann["bbox"] = maskUtils.toBbox(ann["segmentation"])
+                ann["id"] = id + 1
+                ann["iscrowd"] = 0
+        elif "keypoints" in anns[0]:
+            res.dataset["categories"] = copy.deepcopy(list(self.cats.values()))
+            for id, ann in enumerate(anns):
+                s = ann["keypoints"]
+                x = s[0::3]
+                y = s[1::3]
+                x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
+                ann["area"] = (x1 - x0) * (y1 - y0)
+                ann["id"] = id + 1
+                ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
+        print("DONE (t={:0.2f}s)".format(time.time() - tic))
+
+        res.dataset["annotations"] = anns
+        res.createIndex()
+        return res
+
+
+class DeeplakeCOCO(_COCO):
+    """This class is almost the same as the official pycocotools package.
+
+    It implements some snake-case function aliases so that the COCO class has
+    the same interface as the LVIS class.
+    """
+
+    def __init__(
+        self,
+        deeplake_dataset=None,
+        imgs=None,
+        masks=None,
+        masks_type_kind=None,
+        bboxes=None,
+        labels=None,
+        iscrowds=None,
+        class_names=None,
+        bbox_format=("LTRB", "pixel"),
+    ):
+        if getattr(pycocotools, "__version__", "0") >= "12.0.2":
+            warnings.warn(
+                'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"',  # noqa: E501
+                UserWarning,
+            )
+        super().__init__(
+            deeplake_dataset=deeplake_dataset,
+            imgs=imgs,
+            masks=masks,
+            masks_type_kind=masks_type_kind,
+            labels=labels,
+            bboxes=bboxes,
+            iscrowds=iscrowds,
+            class_names=class_names,
+            bbox_format=bbox_format,
+        )
+        self.img_ann_map = self.imgToAnns
+        self.cat_img_map = self.catToImgs
+
+    def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None):
+        return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd)
+
+    def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]):
+        return self.getCatIds(cat_names, sup_names, cat_ids)
+
+    def get_img_ids(self, img_ids=[], cat_ids=[]):
+        return self.getImgIds(img_ids, cat_ids)
+
+    def load_anns(self, ids):
+        return self.loadAnns(ids)
+
+    def load_cats(self, ids):
+        return self.loadCats(ids)
+
+    def load_imgs(self, ids):
+        return self.loadImgs(ids)
+
+
+class COCODatasetEvaluater(mmdet_coco.CocoDataset):
+    def __init__(
+        self,
+        pipeline,
+        deeplake_dataset=None,
+        classes=None,
+        img_prefix="",
+        seg_prefix=None,
+        seg_suffix=".png",
+        proposal_file=None,
+        test_mode=True,
+        filter_empty_gt=True,
+        file_client_args=dict(backend="disk"),
+        imgs=None,
+        masks=None,
+        masks_type_kind=None,
+        bboxes=None,
+        labels=None,
+        iscrowds=None,
+        bbox_format=None,
+        batch_size=1,
+        num_gpus=1,
+    ):
+        self.img_prefix = img_prefix
+        self.seg_prefix = seg_prefix
+        self.seg_suffix = seg_suffix
+        self.proposal_file = proposal_file
+        self.test_mode = test_mode
+        self.filter_empty_gt = filter_empty_gt
+        self.file_client = mmcv.FileClient(**file_client_args)
+        self.CLASSES = classes
+        self.batch_size = batch_size
+        self.num_gpus = num_gpus
+        self.masks_type_kind = masks_type_kind
+
+        self.data_infos = self.load_annotations(
+            deeplake_dataset,
+            imgs=imgs,
+            labels=labels,
+            masks=masks,
+            masks_type_kind=self.masks_type_kind,
+            bboxes=bboxes,
+            iscrowds=iscrowds,
+            class_names=self.CLASSES,
+            bbox_format=bbox_format,
+        )
+        self.proposals = None
+
+        # filter images too small and containing no annotations
+        if not test_mode:
+            valid_inds = self._filter_imgs()
+            self.data_infos = [self.data_infos[i] for i in valid_inds]
+            if self.proposals is not None:
+                self.proposals = [self.proposals[i] for i in valid_inds]
+            # set group flag for the sampler
+            self._set_group_flag()
+
+        # processing pipeline
+
+    def pipeline(self, x):
+        return x
+
+    def __len__(self):
+        length = super().__len__()
+        per_gpu_length = math.floor(length / (self.batch_size * self.num_gpus))
+        total_length = per_gpu_length * self.num_gpus
+        return total_length
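A quick worked example of the length truncation in `__len__` above, with illustrative numbers; the effect, presumably, is that every rank runs the same whole number of full batches:

```python
import math

# Illustrative numbers only: 103 samples, batch_size=4, num_gpus=2.
length, batch_size, num_gpus = 103, 4, 2
per_gpu_length = math.floor(length / (batch_size * num_gpus))  # 12
total_length = per_gpu_length * num_gpus                       # 24
```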
+
+    def load_annotations(
+        self,
+        deeplake_dataset,
+        imgs=None,
+        labels=None,
+        masks=None,
+        masks_type_kind=None,
+        bboxes=None,
+        iscrowds=None,
+        class_names=None,
+        bbox_format=None,
+    ):
+        """Load COCO-style annotations directly from a Deep Lake dataset.
+
+        Args:
+            deeplake_dataset (dp.Dataset): Deeplake dataset object.
+            imgs (dp.Tensor): image deeplake tensor.
+            labels (List[numpy]): List of labels for every detection for each image in numpy format.
+            masks (List[numpy]): List of masks for every detection for each image in numpy format.
+            masks_type_kind: type kind of the masks tensor.
+            bboxes (List[numpy]): List of bboxes for every detection for each image in numpy format.
+            iscrowds (List[numpy]): List of iscrowds for every detection for each image in numpy format.
+            class_names (List[str]): List of class names for every detection for each image.
+            bbox_format (Tuple[str, str]): Tuple containing bbox format information, e.g. ("LTRB", "pixel").
+
+        Returns:
+            list[dict]: Annotation info from COCO api.
+        """
+
+        self.coco = DeeplakeCOCO(
+            deeplake_dataset,
+            imgs=imgs,
+            labels=labels,
+            bboxes=bboxes,
+            masks=masks,
+            masks_type_kind=masks_type_kind,
+            iscrowds=iscrowds,
+            class_names=class_names,
+            bbox_format=bbox_format,
+        )
+        # The order of returned `cat_ids` will not
+        # change with the order of the CLASSES
+        self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES)
+
+        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+        self.img_ids = self.coco.get_img_ids()
+        data_infos = []
+        total_ann_ids = []
+        for i in self.img_ids:
+            info = self.coco.load_imgs([i])[0]
+            data_infos.append(info)
+            ann_ids = self.coco.get_ann_ids(img_ids=[i])
+            total_ann_ids.extend(ann_ids)
+        assert len(set(total_ann_ids)) == len(total_ann_ids)
+        return data_infos
+
+
+def convert_poly_to_coco_format(masks):
+    if isinstance(masks, np.ndarray):
+        px = masks[..., 0]
+        py = masks[..., 1]
+        poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)]
+        poly = [[float(p) for x in poly for p in x]]
+        return poly
+    poly = []
+    for mask in masks:
+        poly_i = convert_poly_to_coco_format(mask)
+        poly.append([np.array(poly_i[0])])
+    return poly
diff --git a/python/deeplake/integrations/mmdet/test_.py b/python/deeplake/integrations/mmdet/test_.py
new file mode 100644
index 0000000000..c574bbc253
--- /dev/null
+++ b/python/deeplake/integrations/mmdet/test_.py
@@ -0,0 +1,225 @@
+# Copyright (c) OpenMMLab. All rights reserved.
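Before the test loop below, a short sketch of `convert_poly_to_coco_format` from mmdet_utils_.py above; note the two recursion modes (a single `(N, 2)` vertex array vs. a list of them) and the half-pixel center offset. The inputs are illustrative:

```python
import numpy as np

triangle = np.array([[0, 0], [10, 0], [0, 10]])  # (N, 2) polygon vertices
print(convert_poly_to_coco_format(triangle))
# [[0.5, 0.5, 10.5, 0.5, 0.5, 10.5]] -- flattened x,y pairs with +0.5 offset

# A list input recurses once per instance, wrapping each result as an array.
many = convert_poly_to_coco_format([triangle, triangle])
```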
+import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results +from mmdet.utils import get_device + + +def single_gpu_test( + model, + data_loader, + show=False, + out_dir=None, + show_score_thr=0.3, + show_box_only=False, + show_mask_only=False, +): + model.eval() + results = [] + dataset = data_loader.dataset.mmdet_dataset + PALETTE = getattr(dataset, "PALETTE", None) + prog_bar = mmcv.ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + + batch_size = len(result) + if show or out_dir: + if batch_size == 1 and isinstance(data["img"][0], torch.Tensor): + img_tensor = data["img"][0] + else: + img_tensor = data["img"][0].data[0] + img_metas = data["img_metas"][0].data[0] + imgs = tensor2imgs(img_tensor, **img_metas[0]["img_norm_cfg"]) + assert len(imgs) == len(img_metas) + + for i, (img, img_meta) in enumerate(zip(imgs, img_metas)): + h, w, _ = img_meta["img_shape"] + img_show = img[:h, :w, :] + + ori_h, ori_w = img_meta["ori_shape"][:-1] + img_show = mmcv.imresize(img_show, (ori_w, ori_h)) + + if out_dir: + out_file = osp.join(out_dir, img_meta["ori_filename"]) + else: + out_file = None + + model.module.show_result( + img_show, + result[i], + bbox_color=PALETTE, + text_color=PALETTE, + mask_color=PALETTE, + show=show, + out_file=out_file, + score_thr=show_score_thr, + show_box_only=show_box_only, + show_mask_only=show_mask_only, + ) + + # encode mask results + if isinstance(result[0], tuple): + result = [ + (bbox_results, encode_mask_results(mask_results)) + for bbox_results, mask_results in result + ] + # This logic is only used in panoptic segmentation test. + elif isinstance(result[0], dict) and "ins_results" in result[0]: + for j in range(len(result)): + bbox_results, mask_results = result[j]["ins_results"] + result[j]["ins_results"] = ( + bbox_results, + encode_mask_results(mask_results), + ) + + results.extend(result) + + for _ in range(batch_size): + prog_bar.update() + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset.mmdet_dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. 
+ for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result[0], tuple): + result = [ + (bbox_results, encode_mask_results(mask_results)) + for bbox_results, mask_results in result + ] + # This logic is only used in panoptic segmentation test. + elif isinstance(result[0], dict) and "ins_results" in result[0]: + for j in range(len(result)): + bbox_results, mask_results = result[j]["ins_results"] + result[j]["ins_results"] = ( + bbox_results, + encode_mask_results(mask_results), + ) + + results.extend(result) + + if rank == 0: + batch_size = len(result) + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + default_device = get_device() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full( + (MAX_LEN,), 32, dtype=torch.uint8, device=default_device + ) + if rank == 0: + mmcv.mkdir_or_exist(".dist_test") + tmpdir = tempfile.mkdtemp(dir=".dist_test") + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device=default_device + ) + dir_tensor[: len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f"part_{rank}.pkl")) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f"part_{i}.pkl") + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + rank, world_size = get_dist_info() + default_device = get_device() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device=default_device + ) + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device=default_device) + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device=default_device) + part_send[: shape_tensor[0]] = part_tensor + part_recv_list = [part_tensor.new_zeros(shape_max) for _ in range(world_size)] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_list.append(pickle.loads(recv[: shape[0]].cpu().numpy().tobytes())) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results diff --git a/python/deeplake/integrations/mmseg/__init__.py 
 b/python/deeplake/integrations/mmseg/__init__.py
new file mode 100644
index 0000000000..29d7586f84
--- /dev/null
+++ b/python/deeplake/integrations/mmseg/__init__.py
@@ -0,0 +1,2 @@
+from deeplake.integrations.mmseg.mmseg_ import train_segmentor
+from mmseg.models import build_segmentor  # type: ignore
diff --git a/python/deeplake/integrations/mmseg/compose_transform_.py b/python/deeplake/integrations/mmseg/compose_transform_.py
new file mode 100644
index 0000000000..0dabdc657b
--- /dev/null
+++ b/python/deeplake/integrations/mmseg/compose_transform_.py
@@ -0,0 +1,78 @@
+import io
+import numpy as np
+from typing import Callable, Optional, List
+from functools import partial
+
+from PIL import Image  # type: ignore
+
+from deeplake.integrations.mm.exceptions import InvalidImageError, InvalidSegmentError
+from deeplake.integrations.mm.upcast_array import upcast_array
+from mmcv.utils import build_from_cfg
+from mmseg.datasets.builder import PIPELINES  # type: ignore
+from mmseg.datasets.pipelines import Compose  # type: ignore
+
+
+def build_pipeline(steps):
+    return Compose(
+        [
+            build_from_cfg(step, PIPELINES, None)
+            for step in steps
+            if step["type"] not in {"LoadImageFromFile", "LoadAnnotations"}
+        ]
+    )
+
+
+def transform(
+    sample_in,
+    images_tensor: str,
+    masks_tensor: str,
+    pipeline: Callable,
+):
+    try:
+        img = upcast_array(sample_in[images_tensor])
+    except Exception as e:
+        raise InvalidImageError(images_tensor, e)
+    if isinstance(img, (bytes, bytearray)):
+        img = np.array(Image.open(io.BytesIO(img)))
+    elif not isinstance(img, np.ndarray):
+        img = np.array(img)
+
+    try:
+        mask = sample_in[masks_tensor]
+    except Exception as e:
+        raise InvalidSegmentError(masks_tensor, e)
+    if not isinstance(mask, np.ndarray):
+        mask = np.array(mask)
+
+    if img.ndim == 2:
+        img = np.expand_dims(img, -1)
+
+    img = img[..., ::-1]  # rgb_to_bgr should be optional
+    if img.shape[2] == 1:
+        img = np.repeat(img, 3, axis=2)
+    shape = img.shape
+
+    pipeline_dict = {
+        "img": np.ascontiguousarray(img, dtype=np.float32),
+        "img_fields": ["img"],
+        "filename": None,
+        "ori_filename": None,
+        "img_shape": shape,
+        "ori_shape": shape,
+        "gt_semantic_seg": np.ascontiguousarray(mask, np.int64),
+        "seg_fields": ["gt_semantic_seg"],
+    }
+
+    return pipeline(pipeline_dict)
+
+
+def compose_transform(
+    images_tensor: str,
+    masks_tensor: Optional[str],
+    pipeline: List,
+):
+    pipeline = build_pipeline(pipeline)
+    return partial(
+        transform,
+        images_tensor=images_tensor,
+        masks_tensor=masks_tensor,
+        pipeline=pipeline,
+    )
diff --git a/python/deeplake/integrations/mmseg/mmseg_.py b/python/deeplake/integrations/mmseg/mmseg_.py
new file mode 100644
index 0000000000..be709fdbe7
--- /dev/null
+++ b/python/deeplake/integrations/mmseg/mmseg_.py
@@ -0,0 +1,739 @@
+"""
+Deep Lake offers an integration with MMSegmentation, a popular open-source semantic segmentation toolbox based on PyTorch.
+The integration enables users to train models while streaming Deep Lake datasets using the transformation, training, and evaluation tools built by MMSeg.
+
+Learn more about MMSegmentation `here `_.
+
+Integration Interface
+~~~~~~~~~~~~~~~~~~~~~
+MMSegmentation works with configs. Deeplake adopted this strategy, and in order to train MMSeg models, you need to create/specify your model
+and training/validation config. Deep Lake integration's logic is almost the same as MMSegmentation's with some minor modifications. The integration
+with MMSeg occurs in the deeplake.integrations.mmseg module.
+At a high level, Deep Lake is responsible for the pytorch dataloader that streams data
+to the training framework, while MMSeg is used for the training, transformation, and evaluation logic. Let us take a look at the config with deeplake changes:
+
+Deeplake integration requires the following parameters to be specified in the configuration file:
+
+- ``data``: Just like in the MMSegmentation configuration files, in the data dictionary you can specify everything that you want to be applied to the data during training and validation
+    - ``train``: Keyword argument of data, a dictionary where one can specify dataset path, credentials, transformations of the training data
+    - ``val``: Keyword argument of data, a dictionary where one can specify dataset path, credentials, transformations of the validation data
+    - ``pipeline``: List of transformations. This parameter exists for train as well as for val.
+
+        - Example:
+
+            >>> pipeline = [dict(type="Resize", img_scale=[(320, 320), (608, 608)], keep_ratio=True), dict(type="RandomFlip", flip_ratio=0.5), dict(type="PhotoMetricDistortion")]
+
+    - ``deeplake_path``: Path to the deeplake dataset. This parameter exists for train as well as for val.
+    - ``deeplake_credentials``: Optional parameter. Required only when using private nonlocal datasets. See the documentation for `deeplake.open_read_only() <https://docs.deeplake.ai/latest/api/dataset/#deeplake.open_read_only>`_ for details. This parameter exists for train as well as for val.
+    - ``deeplake_tag_id``: Optional parameter. If specified, the dataset checks out to the given tag/commit. See the documentation for ``Dataset.commit_id``. This parameter exists for train as well as for val.
+    - ``deeplake_query``: Optional parameter. If specified, the dataset is loaded from the query result when ``deeplake_path`` is not given; if ``deeplake_path`` is given, the query is applied to that dataset.
+    - ``deeplake_tensors``: Optional parameter. If specified, maps MMSegmentation tensors to the associated tensors in the dataset. MMSeg tensors are: "img", "gt_semantic_seg". This parameter exists for train as well as for val.
+        - ``"img"``: Stands for image tensor.
+        - ``"gt_semantic_seg"``: Stands for semantic segmentation tensor.
+
+    - ``deeplake_dataloader``: Optional parameter. If specified, represents the parameters of the deeplake dataloader. Deeplake dataloader parameters are: "shuffle", "batch_size", "num_workers". This parameter exists for train as well as for val.
+        - ``"shuffle"``: If ``True`` shuffles the dataset.
+        - ``"batch_size"``: Size of batch. If not specified, dataloader will use ``samples_per_gpu``.
+        - ``"num_workers"``: Number of workers to use. If not specified, dataloader will use ``workers_per_gpu``.
+
+- ``evaluation``: Dictionary that configures validation, e.g. which metrics to compute and how often. Example:
+
+>>> evaluation = dict(metric=["mIoU"], interval=1)
+
+- ``train_segmentor``: Function to train the MMSegmentation model.
+
+    Parameters:
+
+    - ``model``: MMSegmentation model that is going to be used.
+    - ``cfg``: mmcv.ConfigDict, Configuration of the model as well as of the datasets and transforms that are going to be used.
+    - ``ds_train``: Optional parameter. If provided, overwrites deeplake_path in train and passes this dataset directly to the dataloader.
+    - ``ds_val``: Optional parameter. If provided, overwrites deeplake_path in val and passes this dataset directly to the dataloader.
+    - ``ds_train_tensors``: Optional parameter. If provided, overwrites deeplake_tensors in train and passes this tensor mapping directly to the dataloader.
+    - ``ds_val_tensors``: Optional parameter. If provided, overwrites deeplake_tensors in val and passes this tensor mapping directly to the dataloader.
+    - ``distributed``: Optional parameter. If provided, training runs on all available GPUs.
+    - ``meta``: Optional parameter. Meta data used to build the runner.
+    - ``timestamp``: Variable used in runner to make .log and .log.json filenames the same.
+    - ``validate``: Bool, whether validation should be run, defaults to ``True``.
+
+
+MMSegmentation Config Examples
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Below is an example of a deeplake mmseg configuration:
+
+
+>>> _base_ = "../mmsegmentation/configs/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k.py"
+>>> # use caffe img_norm
+>>> img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)
+>>> train_pipeline = [
+...     dict(type='LoadImageFromFile'),
+...     dict(type='LoadAnnotations'),
+...     dict(
+...         type='Expand',
+...         mean=img_norm_cfg['mean'],
+...         to_rgb=img_norm_cfg['to_rgb'],
+...         ratio_range=(1, 2)),
+...     dict(type='Resize', img_scale=[(320, 320), (416, 416)], keep_ratio=True),
+...     dict(type='RandomFlip', flip_ratio=0.0),
+...     dict(type='PhotoMetricDistortion'),
+...     dict(type='Normalize', **img_norm_cfg),
+...     dict(type='Pad', size_divisor=32),
+...     dict(type='DefaultFormatBundle'),
+...     dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+... ]
+>>> test_pipeline = [
+...     dict(type='LoadImageFromFile'),
+...     dict(
+...         type='MultiScaleFlipAug',
+...         img_scale=(416, 416),
+...         flip=False,
+...         transforms=[
+...             dict(type='Resize', keep_ratio=True),
+...             dict(type='RandomFlip', flip_ratio=0.0),
+...             dict(type='Normalize', **img_norm_cfg),
+...             dict(type='Pad', size_divisor=32),
+...             dict(type='ImageToTensor', keys=['img']),
+...             dict(type='Collect', keys=['img'])
+...         ])
+... ]
+>>> #--------------------------------------DEEPLAKE INPUTS------------------------------------------------------------#
+>>> TOKEN = "INSERT_YOUR_DEEPLAKE_TOKEN"
+>>> data = dict(
+...     # samples_per_gpu=4, # Is used instead of batch_size if deeplake_dataloader is not specified below
+...     # workers_per_gpu=8, # Is used instead of num_workers if deeplake_dataloader is not specified below
+...     train=dict(
+...         pipeline=train_pipeline,
+...         # Credentials for authentication. See documentation for deeplake.open_read_only() for details
+...         deeplake_path="hub://activeloop/semantic-seg-train",
+...         deeplake_credentials={
+...             "token": TOKEN,
+...             "creds": None,
+...         },
+...         #OPTIONAL - Checkout the specified commit_id before training
+...         deeplake_commit_id="",
+...         #OPTIONAL - Loads a dataset tag for training based on tag_id
+...         deeplake_tag_id="",
+...         # OPTIONAL - {"mmseg_key": "deep_lake_tensor",...} - Maps Deep Lake tensors to MMSeg dictionary keys.
+...         # If not specified, Deep Lake will auto-infer the mapping, but it might make mistakes if datasets have many tensors
+...         deeplake_tensors = {"img": "images", "gt_semantic_seg": "semantic_seg"},
+...         # OPTIONAL - Parameters to use for the Deep Lake dataloader. If unspecified, the integration uses
+...         # the parameters in other parts of the cfg file such as samples_per_gpu, and others.
+...         deeplake_dataloader = {"shuffle": True, "batch_size": 4, 'num_workers': 8}
+...     ),
+...     # Parameters are the same as for train
+...     val=dict(
+...         pipeline=test_pipeline,
+...         deeplake_path="hub://activeloop/semantic-seg-val",
+...         deeplake_credentials={
+...             "token": TOKEN,
+...             "creds": None,
+...         },
+...         deeplake_tensors = {"img": "images", "gt_semantic_seg": "semantic_seg"},
+...         deeplake_dataloader = {"shuffle": False, "batch_size": 1, 'num_workers': 8}
+...     ),
+... )
+>>> # Which metrics to use for evaluation. In MMSeg (without Deeplake), this is inferred from the dataset type.
+>>> # In the Deep Lake integration, since the format is standardized, a variety of metrics can be used for a given dataset.
+>>> #----------------------------------END DEEPLAKE INPUTS------------------------------------------------------------#
+
+And the training script:
+
+>>> import os
+>>> from mmcv import Config
+>>> import mmcv
+>>> from deeplake.integrations import mmseg as mmseg_deeplake
+>>> cfg = Config.fromfile(cfg_file)
+>>> # Build the segmentor
+>>> model = mmseg_deeplake.build_segmentor(cfg.model)
+>>> # Create work_dir
+>>> mmcv.mkdir_or_exist(os.path.abspath(cfg.work_dir))
+>>> # Run the training
+>>> mmseg_deeplake.train_segmentor(model, cfg, distributed=args.distributed, validate=args.validate)
+"""
+
+import warnings
+import torch
+import numpy as np
+import io
+import math
+import types
+from functools import partial
+
+from typing import Callable, Optional, List, Dict, Sequence, Union
+from PIL import Image  # type: ignore
+
+from mmseg.core import DistEvalHook, EvalHook  # type: ignore
+from mmseg.core import build_optimizer
+from mmseg.utils import (  # type: ignore
+    build_dp,
+    find_latest_checkpoint,
+    get_root_logger,
+)
+
+from mmseg.datasets.samplers import DistributedSampler  # type: ignore
+from mmseg.utils.util_distribution import *  # type: ignore
+from deeplake.integrations.mm.get_indexes import get_indexes
+from deeplake.integrations.mm.worker_init_fn import worker_init_fn
+from deeplake.integrations.mm.ipc import _get_free_port
+from deeplake.integrations.mm.exceptions import ValidationDatasetMissingError
+
+from mmcv.utils import build_from_cfg, digit_version  # type: ignore
+from mmcv.parallel import collate  # type: ignore
+import mmcv  # type: ignore
+from mmcv.runner import init_dist  # type: ignore
+from mmcv.runner import (  # type: ignore
+    DistSamplerSeedHook,
+    EpochBasedRunner,
+    OptimizerHook,
+    build_runner,
+    get_dist_info,
+    HOOKS,
+)
+
+
+import deeplake as dp
+from deeplake.types import TypeKind
+from deeplake.integrations.mm.warnings import always_warn
+
+from deeplake.integrations.mm.mm_runners import DeeplakeIterBasedRunner
+from deeplake.integrations.mm.mm_common import (
+    load_ds_from_cfg,
+    get_collect_keys,
+    check_persistent_workers,
+    find_image_tensor,
+    find_smask_tensor,
+    ddp_setup,
+    force_cudnn_initialization,
+    check_unsupported_functionalities,
+    get_pipeline,
+)
+from deeplake.integrations.mmseg.mmseg_dataset_ import MMSegDataset, MMSegTorchDataset
+from deeplake.integrations.mmseg.compose_transform_ import compose_transform
+
+from torch.utils.data import DataLoader, IterableDataset
+
+
+# Monkey-patch the function
+from deeplake.integrations.mmseg.test_ import single_gpu_test as custom_single_gpu_test
+from deeplake.integrations.mmseg.test_ import multi_gpu_test as custom_multi_gpu_test
+
+import mmseg.apis
+
+mmseg.apis.single_gpu_test = custom_single_gpu_test
+mmseg.apis.multi_gpu_test = custom_multi_gpu_test
+
+
+def build_ddp(model, device, *args, **kwargs):
+    """Build DistributedDataParallel module by device type.
+
+    If device is cuda, return a MMDistributedDataParallel model;
+    if device is mlu, return a MLUDistributedDataParallel model.
+
+    Args:
+        model (:class:`nn.Module`): module to be parallelized.
+        device (str): device type, mlu or cuda.
+        args (List): arguments to be passed to ddp_factory
+        kwargs (dict): keyword arguments to be passed to ddp_factory
+
+    Returns:
+        :class:`nn.Module`: the module to be parallelized
+
+    References:
+        .. [1] https://pytorch.org/docs/stable/generated/torch.nn.parallel.
+                     DistributedDataParallel.html
+    """
+
+    assert device in ["cuda", "mlu"], "Only available for cuda or mlu devices."
+    if device == "cuda":
+        model = model.cuda(kwargs["device_ids"][0])  # patch
+    elif device == "mlu":
+        from mmcv.device.mlu import MLUDistributedDataParallel  # type: ignore
+
+        ddp_factory["mlu"] = MLUDistributedDataParallel
+        model = model.mlu()
+
+    return ddp_factory[device](model, *args, **kwargs)
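A hedged sketch of how this wrapper gets invoked; it mirrors the call made later in `_train_segmentor`, with a placeholder device id and an already-built model:

```python
# Minimal sketch, assuming a built mmseg model and one visible CUDA device.
model = build_ddp(
    model,
    "cuda",
    device_ids=[0],  # placeholder; _train_segmentor passes cfg.gpu_ids[local_rank]
    broadcast_buffers=False,
    find_unused_parameters=False,
)
```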
+
+
+def mmseg_subiterable_dataset_eval(
+    self,
+    *args,
+    **kwargs,
+):
+    return self.dataset.mmseg_dataset.evaluate(*args, **kwargs)
+
+
+def train_segmentor(
+    model,
+    cfg: mmcv.ConfigDict,
+    ds_train=None,
+    ds_train_tensors=None,
+    ds_val: Optional[dp.Dataset] = None,
+    ds_val_tensors=None,
+    distributed: bool = False,
+    timestamp=None,
+    meta=None,
+    validate: bool = True,
+):
+    """
+    Creates the runner, then trains and optionally evaluates the model.
+
+    Args:
+        model: model to train, should be built before passing
+        cfg: mmcv.ConfigDict object containing all necessary configuration.
+            In cfg we have several changes to support deeplake integration:
+            _base_: still serves as a base model to inherit from
+            data: everything related to data processing; you will need to specify the following parameters:
+                train: everything related to training data, it has the following attributes:
+                    pipeline: dictionary where all training augmentations and transformations should be specified, like in mmdet
+                    deeplake_tensors: dictionary that maps mmseg keys to deeplake dataset tensors. Example: `{"img": "images", "gt_semantic_seg": "semantic_seg"}`.
+                        If this dictionary is not specified, these tensors will be searched automatically using htypes like "image" and "segment_mask".
+                        The keys that need to be mapped are `img` and `gt_semantic_seg`. Both are always required; if they are not specified, they
+                        are searched automatically. If you list `gt_semantic_seg` in the Collect keys, you need to either specify it in the config or it will be searched based on the
+                        `segment_mask` htype.
+                    deeplake_credentials: dictionary with deeplake credentials that allow you to access the specified data. It has the following argument: `token`.
+                        `token` is the token that gives you read or write access to the datasets. It is available in your personal account on: https://www.activeloop.ai/.
+                val (Optional): everything related to validation data, it has the following attributes:
+                    pipeline: dictionary where all training augmentations and transformations should be specified, like in mmdet
+                    deeplake_tensors: dictionary that maps mmseg keys to deeplake dataset tensors. Example: `{"img": "images", "gt_semantic_seg": "semantic_seg"}`.
+                        If this dictionary is not specified, these tensors will be searched automatically using htypes like "image" and "segment_mask".
+                        The keys that need to be mapped are `img` and `gt_semantic_seg`. Both are always required; if they are not specified, they
+                        are searched automatically. If you list `gt_semantic_seg` in the Collect keys, you need to either specify it in the config or it will be searched based on the
+                        `segment_mask` htype.
+                    deeplake_credentials: deeplake credentials that allow you to access the specified data. It has the following argument: `token`.
+                        `token` is the token that gives you read or write access to the datasets. It is available in your personal account on: https://www.activeloop.ai/.
+                test (Optional): everything related to testing data, it has the following attributes:
+                    pipeline: dictionary where all training augmentations and transformations should be specified, like in mmdet
+                    deeplake_tensors: dictionary that maps mmseg keys to deeplake dataset tensors. Example: `{"img": "images", "gt_semantic_seg": "semantic_seg"}`.
+                        If this dictionary is not specified, these tensors will be searched automatically using htypes like "image" and "segment_mask".
+                        The keys that need to be mapped are `img` and `gt_semantic_seg`. Both are always required; if they are not specified, they
+                        are searched automatically. If you list `gt_semantic_seg` in the Collect keys, you need to either specify it in the config or it will be searched based on the
+                        `segment_mask` htype.
+                    deeplake_credentials: deeplake credentials that allow you to access the specified data. It has the following argument: `token`.
+                        `token` is the token that gives you read or write access to the datasets. It is available in your personal account on: https://www.activeloop.ai/.
+            samples_per_gpu: number of samples to be processed per gpu
+            workers_per_gpu: number of workers per gpu
+            optimizer: dictionary containing information about optimizer initialization
+            optimizer_config: some optimizer configuration that might be used during training like grad_clip etc.
+            runner: training type e.g. EpochBasedRunner, here you can specify maximum number of epochs to be conducted. For instance: `runner = dict(type='EpochBasedRunner', max_epochs=273)`
+        ds_train: train dataset of type dp.Dataset. This can be a view of the dataset.
+        ds_train_tensors: dictionary that maps mmseg keys to deeplake dataset tensors. Example: `{"img": "images", "gt_semantic_seg": "semantic_seg"}`.
+            If this dictionary is not specified, these tensors will be searched automatically using htypes like "image" and "segment_mask".
+            The keys that need to be mapped are `img` and `gt_semantic_seg`. Both are always required; if they are not specified, they
+            are searched automatically. If you list `gt_semantic_seg` in the Collect keys, you need to either specify it in the config or it will be searched based on the
+            `segment_mask` htype.
+        ds_val: validation dataset of type dp.Dataset. This can be a view of the dataset.
+        ds_val_tensors: dictionary that maps mmseg keys to deeplake dataset tensors. Example: `{"img": "images", "gt_semantic_seg": "semantic_seg"}`.
+            If this dictionary is not specified, these tensors will be searched automatically using htypes like "image" and "segment_mask".
+            The keys that need to be mapped are `img` and `gt_semantic_seg`. Both are always required; if they are not specified, they
+            are searched automatically. If you list `gt_semantic_seg` in the Collect keys, you need to either specify it in the config or it will be searched based on the
+            `segment_mask` htype.
+        evaluation: dictionary that contains all information needed for evaluation apart from data processing, like how often evaluation should be done and what metrics we want to use.
+            For instance, `evaluation = dict(interval=1, metric=['mIoU'])`
+        distributed: bool, whether DDP training should be used; `False` by default
+        timestamp: variable used in the runner to make the .log and .log.json filenames the same
+        meta: metadata used to build the runner
+        validate: bool, whether validation should be run; `True` by default
+    """
+    check_unsupported_functionalities(cfg)
+
+    if not hasattr(cfg, "gpu_ids"):
+        cfg.gpu_ids = range(torch.cuda.device_count() if distributed else 1)
+    if distributed:
+        return torch.multiprocessing.spawn(
+            _train_segmentor,
+            args=(
+                model,
+                cfg,
+                ds_train,
+                ds_train_tensors,
+                ds_val,
+                ds_val_tensors,
+                distributed,
+                timestamp,
+                meta,
+                validate,
+                _get_free_port(),
+            ),
+            nprocs=len(cfg.gpu_ids),
+        )
+    _train_segmentor(
+        0,
+        model,
+        cfg,
+        ds_train,
+        ds_train_tensors,
+        ds_val,
+        ds_val_tensors,
+        distributed,
+        timestamp,
+        meta,
+        validate,
+    )
+
+
+def register_validation_hook_(
+    batch_size: int,
+    num_workers: int,
+    distributed: bool,
+    cfg: mmcv.ConfigDict,
+    ignore_index: int,
+    reduce_zero_label: bool,
+    train_persistent_workers: bool = False,
+    ds_val: Optional[dp.Dataset] = None,
+    ds_val_tensors=None,
+    runner=None,
+):
+    eval_cfg = cfg.get("evaluation", {})
+    val_dataloader_default_args = dict(
+        samples_per_gpu=batch_size,
+        workers_per_gpu=num_workers,
+        dist=distributed,
+        shuffle=False,
+        mode="val",
+        seed=cfg.seed,
+        num_gpus=len(cfg.gpu_ids),
+        ignore_index=ignore_index,
+        reduce_zero_label=reduce_zero_label,
+    )
+
+    val_dataloader_args = {
+        **cfg.data.val.get("deeplake_dataloader", {}),
+        **val_dataloader_default_args,
+    }
+
+    val_persistent_workers = val_dataloader_args.get("persistent_workers", False)
+    check_persistent_workers(train_persistent_workers, val_persistent_workers)
+
+    # The merge above forces shuffle=False, so check the user-provided
+    # dataloader config to see whether a requested shuffle is being dropped.
+    if cfg.data.val.get("deeplake_dataloader", {}).get("shuffle", False):
+        always_warn("shuffle argument for validation dataset will be ignored.")
+
+    if ds_val is None:
+        cfg_ds_val = cfg.data.get("val")
+        if not cfg_ds_val or not any(
+            cfg_ds_val.get(key) is not None
+            for key in ["deeplake_path", "deeplake_query"]
+        ):
+            raise ValidationDatasetMissingError()
+        ds_val = load_ds_from_cfg(cfg.data.val)
+        ds_val_tensors = cfg.data.val.get("deeplake_tensors", {})
+    else:
+        cfg_data = cfg.data.val.get("deeplake_path")
+        if cfg_data is not None:
+            always_warn(
+                "A Deep Lake dataset was specified in the cfg as well as in the dataset input to train_segmentor. The dataset input to train_segmentor will be used in the workflow."
+            )
+
+    if ds_val is None:
+        raise ValidationDatasetMissingError()
+
+    if ds_val_tensors:
+        val_images_tensor = ds_val_tensors["img"]
+        val_masks_tensor = ds_val_tensors.get("gt_semantic_seg")
+    else:
+        val_images_tensor = find_image_tensor(ds_val, mm_class="img")
+        val_masks_tensor = None
+        collection_keys = get_collect_keys(cfg)
+        if "gt_semantic_seg" in collection_keys:
+            val_masks_tensor = find_smask_tensor(ds_val, mm_class="gt_semantic_seg")
+
+    val_pipeline = get_pipeline(cfg, name="val", generic_name="test_pipeline")
+
+    val_dataloader = build_dataloader(
+        ds_val,
+        val_images_tensor,
+        val_masks_tensor,
+        pipeline=val_pipeline,
+        **val_dataloader_args,
+    )
+
+    eval_cfg["by_epoch"] = cfg.runner["type"] != "DeeplakeIterBasedRunner"
+    eval_cfg["pre_eval"] = False
+    eval_hook = EvalHook
+    if distributed:
+        eval_hook = DistEvalHook
+    # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the
+    # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'.
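+    # Registering the eval hook at "LOW" priority keeps it running after the
+    # default "NORMAL"-priority hooks, such as IterTimerHook, on each interval.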
+    runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority="LOW")
+
+
+def _train_segmentor(
+    local_rank,
+    model,
+    cfg: mmcv.ConfigDict,
+    ds_train=None,
+    ds_train_tensors=None,
+    ds_val: Optional[dp.Dataset] = None,
+    ds_val_tensors=None,
+    distributed: bool = False,
+    timestamp=None,
+    meta=None,
+    validate: bool = True,
+    port=None,
+):
+    batch_size = cfg.data.get("samples_per_gpu", 256)
+    num_workers = cfg.data.get("workers_per_gpu", 1)
+
+    ignore_index = cfg.get("ignore_index", 255)
+    reduce_zero_label = cfg.get("reduce_zero_label", False)
+
+    if ds_train is None:
+        ds_train = load_ds_from_cfg(cfg.data.train)
+        ds_train_tensors = cfg.data.train.get("deeplake_tensors", {})
+    else:
+        cfg_data = cfg.data.train.get("deeplake_path")
+        if cfg_data:
+            always_warn(
+                "A Deep Lake dataset was specified in the cfg as well as in the dataset input to train_segmentor. The dataset input to train_segmentor will be used in the workflow."
+            )
+
+    if ds_train_tensors:
+        train_images_tensor = ds_train_tensors["img"]
+        train_masks_tensor = ds_train_tensors.get("gt_semantic_seg")
+    else:
+        train_images_tensor = find_image_tensor(ds_train, mm_class="img")
+        train_masks_tensor = None
+
+        collection_keys = get_collect_keys(cfg)
+        if "gt_semantic_seg" in collection_keys:
+            train_masks_tensor = find_smask_tensor(ds_train, mm_class="gt_semantic_seg")
+
+    model.CLASSES = ds_train[train_masks_tensor].metadata["class_names"]
+
+    logger = get_root_logger(log_level=cfg.log_level)
+    runner_type = "EpochBasedRunner" if "runner" not in cfg else cfg.runner["type"]
+
+    train_dataloader_default_args = dict(
+        samples_per_gpu=batch_size,
+        workers_per_gpu=num_workers,
+        # `num_gpus` will be ignored if distributed
+        num_gpus=len(cfg.gpu_ids),
+        dist=distributed,
+        seed=cfg.seed,
+        runner_type=runner_type,
+        ignore_index=ignore_index,
+        reduce_zero_label=reduce_zero_label,
+    )
+
+    train_loader_cfg = {
+        **train_dataloader_default_args,
+        **cfg.data.get("train_dataloader", {}),
+        **cfg.data.train.get("deeplake_dataloader", {}),
+    }
+
+    # put model on gpus
+    if distributed:
+        # Sets the `find_unused_parameters` parameter of the underlying
+        # torch.nn.parallel.DistributedDataParallel wrapper built by build_ddp.
+        find_unused_parameters = cfg.get("find_unused_parameters", False)
+        force_cudnn_initialization(cfg.gpu_ids[local_rank])
+        ddp_setup(local_rank, len(cfg.gpu_ids), port)
+        model = build_ddp(
+            model,
+            cfg.device,
+            device_ids=[cfg.gpu_ids[local_rank]],
+            broadcast_buffers=False,
+            find_unused_parameters=find_unused_parameters,
+        )
+    else:
+        model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)
+
+    train_pipeline = get_pipeline(cfg, name="train", generic_name="train_pipeline")
+
+    data_loader = build_dataloader(
+        ds_train,
+        train_images_tensor,
+        train_masks_tensor,
+        pipeline=train_pipeline,
+        **train_loader_cfg,
+    )
+
+    # build optimizer
+    optimizer = build_optimizer(model, cfg.optimizer)
+
+    # check runner
+    cfg.custom_imports = dict(
+        imports=["deeplake.integrations.mm.mm_runners"],
+        allow_failed_imports=False,
+    )
+    if cfg.runner.type == "IterBasedRunner":
+        cfg.runner.type = "DeeplakeIterBasedRunner"
+    elif cfg.runner.type == "EpochBasedRunner":
+        cfg.runner.type = "DeeplakeEpochBasedRunner"
+
+    runner = build_runner(
+        cfg.runner,
+        default_args=dict(
+            model=model,
+            optimizer=optimizer,
+            work_dir=cfg.work_dir,
+            logger=logger,
+            meta=meta,
+            force_cleanup=False,
+        ),
+    )
+
+    # a workaround to make the .log and .log.json filenames the same
+    runner.timestamp = timestamp
+
+    if distributed and "type" not in cfg.optimizer_config:
+        optimizer_config = OptimizerHook(**cfg.optimizer_config)
+    else:
+        optimizer_config = cfg.optimizer_config
+
+    # register hooks
+    runner.register_training_hooks(
+        cfg.lr_config,
+        optimizer_config,
+        cfg.checkpoint_config,
+        cfg.log_config,
+        cfg.get("momentum_config", None),
+    )
+
+    if distributed and isinstance(runner, EpochBasedRunner):
+        runner.register_hook(DistSamplerSeedHook())
+
+    # register eval hooks
+    if validate:
+        register_validation_hook_(
+            batch_size=batch_size,
+            num_workers=num_workers,
+            distributed=distributed,
+            train_persistent_workers=train_loader_cfg.get("persistent_workers", False),
+            cfg=cfg,
+            ignore_index=ignore_index,
+            reduce_zero_label=reduce_zero_label,
+            ds_val=ds_val,
+            ds_val_tensors=ds_val_tensors,
+            runner=runner,
+        )
+
+    # user-defined hooks
+    if cfg.get("custom_hooks", None):
+        custom_hooks = cfg.custom_hooks
+        assert isinstance(
+            custom_hooks, list
+        ), f"custom_hooks expect list type, but got {type(custom_hooks)}"
+        for hook_cfg in cfg.custom_hooks:
+            assert isinstance(hook_cfg, dict), (
+                "Each item in custom_hooks expects dict type, but got "
+                f"{type(hook_cfg)}"
+            )
+            hook_cfg = hook_cfg.copy()
+            priority = hook_cfg.pop("priority", "NORMAL")
+            hook = build_from_cfg(hook_cfg, HOOKS)
+            runner.register_hook(hook, priority=priority)
+
+    resume_from = None
+    if cfg.resume_from is None and cfg.get("auto_resume"):
+        resume_from = find_latest_checkpoint(cfg.work_dir)
+    if resume_from is not None:
+        cfg.resume_from = resume_from
+
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run([data_loader], cfg.workflow)
+
+
+def build_dataloader(
+    dataset: dp.Dataset,
+    images_tensor: str,
+    masks_tensor: Optional[str],
+    pipeline: List,
+    mode: str = "train",
+    **loader_config,
+):
+    persistent_workers = loader_config.get("persistent_workers", False)
+    # Forward ignore_index / reduce_zero_label to MMSegDataset so the values
+    # configured in the cfg are actually used during evaluation.
+    ignore_index = loader_config.get("ignore_index", 255)
+    reduce_zero_label = loader_config.get("reduce_zero_label", False)
+    dist = loader_config["dist"]
+    seed = loader_config["seed"]
+    transform_fn = compose_transform(
+        images_tensor=images_tensor, masks_tensor=masks_tensor, pipeline=pipeline
+    )
+
+    num_workers = loader_config.get("num_workers")
+    pin_memory = loader_config.get("pin_memory", False)
+    if num_workers is None:
+        num_workers = loader_config["workers_per_gpu"]
+
+    shuffle = loader_config.get("shuffle", True)
+
+    tensors_dict = {
+        "images_tensor": images_tensor,
+    }
+    tensors = [images_tensor]
+    if masks_tensor is not None:
+        tensors.append(masks_tensor)
+        tensors_dict["masks_tensor"] = masks_tensor
+
+    batch_size = loader_config.get("batch_size")
+    drop_last = loader_config.get("drop_last", False)
+    if batch_size is None:
+        batch_size = loader_config["samples_per_gpu"]
+
+    collate_fn = partial(collate, samples_per_gpu=batch_size)
+
+    mmseg_ds = MMSegDataset(
+        dataset=dataset,
+        transform=transform_fn,
+        tensors_dict=tensors_dict,
+        tensors=tensors,
+        mode=mode,
+        num_gpus=loader_config["num_gpus"],
+        batch_size=batch_size,
+        ignore_index=ignore_index,
+        reduce_zero_label=reduce_zero_label,
+    )
+
+    # `rank` is also needed by worker_init_fn below, so fetch it even when not
+    # running distributed (get_dist_info falls back to rank 0, world_size 1).
+    rank, world_size = get_dist_info()
+    if dist:
+        sl = get_indexes(
+            dataset, rank=rank, num_replicas=world_size, drop_last=drop_last
+        )
+        dataset = dataset.query(
+            f"select * LIMIT {sl.stop - sl.start} OFFSET {sl.start}"
+        )
+
+    pytorch_ds = MMSegTorchDataset(dataset, transform=transform_fn)
+    pytorch_ds.mmseg_dataset = mmseg_ds
+
+    init_fn = (
+        partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed)
+        if seed is not None
+        else None
+    )
+
+    if digit_version(torch.__version__) >= digit_version("1.8.0"):
+        loader = DataLoader(
+            pytorch_ds,
+            batch_size=batch_size,
+            sampler=None,
+            num_workers=num_workers,
+            collate_fn=collate_fn,
+            pin_memory=pin_memory,
+            shuffle=shuffle,
+            worker_init_fn=init_fn,
+            drop_last=drop_last,
+            persistent_workers=persistent_workers,
+        )
+    else:
+        loader = DataLoader(
+            pytorch_ds,
+            batch_size=batch_size,
+            sampler=None,
+            num_workers=num_workers,
+            collate_fn=collate_fn,
+            pin_memory=pin_memory,
+            shuffle=shuffle,
+            worker_init_fn=init_fn,
+            drop_last=drop_last,
+        )
+
+    eval_fn = partial(mmseg_subiterable_dataset_eval, loader)
+    loader.dataset.evaluate = eval_fn
+
+    return loader
diff --git a/python/deeplake/integrations/mmseg/mmseg_dataset_.py b/python/deeplake/integrations/mmseg/mmseg_dataset_.py
new file mode 100644
index 0000000000..494f730b57
--- /dev/null
+++ b/python/deeplake/integrations/mmseg/mmseg_dataset_.py
@@ -0,0 +1,239 @@
+from collections import OrderedDict
+import math
+import warnings
+
+import numpy as np
+
+from typing import Optional, Callable, Sequence
+from torch.utils.data import Dataset
+from prettytable import PrettyTable  # type: ignore
+
+import mmcv
+from mmcv.utils import print_log
+from mmseg.core import eval_metrics, intersect_and_union, pre_eval_to_metrics
+
+from deeplake.integrations.mm.exceptions import InvalidImageError, InvalidSegmentError
+from deeplake.integrations.mm.upcast_array import upcast_array
+
+
+class MMSegTorchDataset(Dataset):
+    def __init__(
+        self,
+        dataset,
+        tensors=None,
+        transform: Optional[Callable] = None,
+    ) -> None:
+        super().__init__()
+        self.dataset = dataset
+        self.transform = transform
+        self.column_names = [col.name for col in self.dataset.schema.columns]
+        self.last_successful_index = -1
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, idx):
+        while True:
+            try:
+                sample = self.dataset[idx]
+                result = None
+                if self.transform:
+                    result = self.transform(sample)
+                else:
+                    out = {}
+                    for col in self.column_names:
+                        out[col] = sample[col]
+                    result = out
+                self.last_successful_index = idx
+                return result
+            except (InvalidImageError, InvalidSegmentError) as e:
+                print(f"Error processing data at index {idx}: {e}")
+                if self.last_successful_index == -1:
+                    self.last_successful_index = idx + 1
+                idx = self.last_successful_index
+                continue
+
+
+class MMSegDataset(MMSegTorchDataset):
+    def __init__(
+        self,
+        *args,
+        tensors_dict,
+        mode="train",
+        num_gpus=1,
+        batch_size=1,
+        ignore_index=255,
+        reduce_zero_label=False,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.mode = mode
+        self.num_gpus = num_gpus
+        self.batch_size = batch_size
+        self.ignore_index = ignore_index
+        self.reduce_zero_label = reduce_zero_label
+        # "masks_tensor" is absent when no mask tensor was found, so use .get
+        self.masks_tensor_name = tensors_dict.get("masks_tensor")
+        if self.mode in ("val", "test"):
+            self.CLASSES = self.get_classes(self.masks_tensor_name)[:]
+
+    def __len__(self):
+        if self.mode == "val":
+            per_gpu_length = math.floor(
+                len(self.dataset) / (self.batch_size * self.num_gpus)
+            )
+            total_length = per_gpu_length * self.num_gpus
+            return total_length
+        return super().__len__()
+
+    def _get_masks(self, masks_tensor):
+        if masks_tensor is None:
+            return []
+        return self.dataset[masks_tensor]
+
+    def get_classes(self, classes):
+        """Get class names of current dataset.
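+
+        The names are read from the ``class_names`` metadata of the given
+        Deep Lake tensor, so no predefined class list is required.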
+
+        Args:
+            classes (str): Represents the name of the classes tensor. Overrides the CLASSES defined by the dataset.
+
+        Returns:
+            list[str]: Names of categories of the dataset.
+        """
+        return self.dataset[classes].metadata["class_names"]
+
+    def get_gt_seg_maps(self, efficient_test=None):
+        """Get ground truth segmentation maps for evaluation."""
+        if efficient_test is not None:
+            warnings.warn(
+                "DeprecationWarning: ``efficient_test`` has been deprecated "
+                "since MMSeg v0.16, the ``get_gt_seg_maps()`` is CPU memory "
+                "friendly by default. "
+            )
+
+        mask_col = self._get_masks(self.masks_tensor_name)
+        last_successful_index = -1
+        for idx in range(len(self)):
+            try:
+                result = upcast_array(mask_col[idx])
+                last_successful_index = idx
+                yield result
+            except Exception as e:
+                print(f"Error processing mask at index {idx}: {e}")
+                if last_successful_index == -1:
+                    continue
+                else:
+                    yield upcast_array(mask_col[last_successful_index])
+
+    def evaluate(self, results, metric="mIoU", logger=None, gt_seg_maps=None, **kwargs):
+        """Evaluate the dataset.
+
+        Args:
+            results (list[tuple[torch.Tensor]] | list[str]): per image pre_eval
+                results or predicted segmentation maps used to compute the
+                evaluation metric.
+            metric (str | list[str]): Metrics to be evaluated. 'mIoU',
+                'mDice' and 'mFscore' are supported.
+            logger (logging.Logger | None | str): Logger used for printing
+                related information during evaluation. Default: None.
+            gt_seg_maps (generator[ndarray]): Custom gt seg maps as input,
+                used in ConcatDataset.
+
+        ..
+            # noqa: DAR101
+
+        Raises:
+            KeyError: if a specified metric format is not supported
+
+        Returns:
+            dict[str, float]: Default metrics.
+        """
+
+        if self.num_gpus > 1:
+            results_ordered = []
+            for i in range(self.num_gpus):
+                results_ordered += results[i :: self.num_gpus]
+            results = results_ordered
+
+        if isinstance(metric, str):
+            metric = [metric]
+        allowed_metrics = ["mIoU", "mDice", "mFscore"]
+        if not set(metric).issubset(set(allowed_metrics)):
+            raise KeyError("metric {} is not supported".format(metric))
+
+        eval_results = {}
+        # test a list of files
+        if mmcv.is_list_of(results, np.ndarray) or mmcv.is_list_of(results, str):
+            if gt_seg_maps is None:
+                gt_seg_maps = self.get_gt_seg_maps()
+            num_classes = len(self.CLASSES)
+            ret_metrics = eval_metrics(
+                results,
+                gt_seg_maps,
+                num_classes,
+                self.ignore_index,
+                metric,
+                label_map=dict(),
+                reduce_zero_label=self.reduce_zero_label,
+            )
+        # test a list of pre_eval_results
+        else:
+            ret_metrics = pre_eval_to_metrics(results, metric)
+
+        # Because dataset.CLASSES is required for per-class evaluation.
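+        # Fall back to numeric class ids when the dataset provides no names.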
+        if self.CLASSES is None:
+            class_names = tuple(range(num_classes))
+        else:
+            class_names = self.CLASSES
+
+        # summary table
+        ret_metrics_summary = OrderedDict(
+            {
+                ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2)
+                for ret_metric, ret_metric_value in ret_metrics.items()
+            }
+        )
+
+        # each class table
+        ret_metrics.pop("aAcc", None)
+        ret_metrics_class = OrderedDict(
+            {
+                ret_metric: np.round(ret_metric_value * 100, 2)
+                for ret_metric, ret_metric_value in ret_metrics.items()
+            }
+        )
+        ret_metrics_class.update({"Class": class_names})
+        ret_metrics_class.move_to_end("Class", last=False)
+
+        # for logger
+        class_table_data = PrettyTable()
+        for key, val in ret_metrics_class.items():
+            class_table_data.add_column(key, val)
+
+        summary_table_data = PrettyTable()
+        for key, val in ret_metrics_summary.items():
+            if key == "aAcc":
+                summary_table_data.add_column(key, [val])
+            else:
+                summary_table_data.add_column("m" + key, [val])
+
+        print_log("per class results:", logger)
+        print_log("\n" + class_table_data.get_string(), logger=logger)
+        print_log("Summary:", logger)
+        print_log("\n" + summary_table_data.get_string(), logger=logger)
+
+        # each metric dict
+        for key, value in ret_metrics_summary.items():
+            if key == "aAcc":
+                eval_results[key] = value / 100.0
+            else:
+                eval_results["m" + key] = value / 100.0
+
+        ret_metrics_class.pop("Class", None)
+        for key, value in ret_metrics_class.items():
+            eval_results.update(
+                {
+                    key + "." + str(name): value[idx] / 100.0
+                    for idx, name in enumerate(class_names)
+                }
+            )
+
+        return eval_results
diff --git a/python/deeplake/integrations/mmseg/test_.py b/python/deeplake/integrations/mmseg/test_.py
new file mode 100644
index 0000000000..fb6e1cc526
--- /dev/null
+++ b/python/deeplake/integrations/mmseg/test_.py
@@ -0,0 +1,245 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+import warnings
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.engine import collect_results_cpu, collect_results_gpu
+from mmcv.image import tensor2imgs
+from mmcv.runner import get_dist_info
+
+
+def np2tmp(array, temp_file_name=None, tmpdir=None):
+    """Save ndarray to local numpy file.
+
+    Args:
+        array (ndarray): Ndarray to save.
+        temp_file_name (str): Numpy file name. If 'temp_file_name=None', this
+            function will generate a file name with tempfile.NamedTemporaryFile
+            to save ndarray. Default: None.
+        tmpdir (str): Temporary directory to save Ndarray files. Default: None.
+    Returns:
+        str: The numpy file name.
+    """
+
+    if temp_file_name is None:
+        temp_file_name = tempfile.NamedTemporaryFile(
+            suffix=".npy", delete=False, dir=tmpdir
+        ).name
+    np.save(temp_file_name, array)
+    return temp_file_name
+
+
+def single_gpu_test(
+    model,
+    data_loader,
+    show=False,
+    out_dir=None,
+    efficient_test=False,
+    opacity=0.5,
+    pre_eval=False,
+    format_only=False,
+    format_args={},
+):
+    """Test with single GPU by progressive mode.
+
+    Args:
+        model (nn.Module): Model to be tested.
+        data_loader (utils.data.Dataloader): Pytorch data loader.
+        show (bool): Whether to show results during inference. Default: False.
+        out_dir (str, optional): If specified, the output results are dumped
+            into this directory.
+        efficient_test (bool): Whether to save the results as local numpy
+            files to save CPU memory during evaluation. Mutually exclusive
+            with pre_eval and format_results. Default: False.
+        opacity (float): Opacity of the painted segmentation map.
+            Default: 0.5. Must be in the (0, 1] range.
+        pre_eval (bool): Use dataset.pre_eval() function to generate
+            pre_results for metric evaluation. Mutually exclusive with
+            efficient_test and format_results. Default: False.
+        format_only (bool): Only format the results for submission without
+            performing evaluation. Mutually exclusive with pre_eval and
+            efficient_test. Default: False.
+        format_args (dict): The args for format_results. Default: {}.
+    Returns:
+        list: list of evaluation pre-results or list of save file names.
+    """
+    if efficient_test:
+        warnings.warn(
+            "DeprecationWarning: ``efficient_test`` will be deprecated, the "
+            "evaluation is CPU memory friendly with pre_eval=True"
+        )
+        mmcv.mkdir_or_exist(".efficient_test")
+    # when none of them is set true, return segmentation results as
+    # a list of np.array.
+    assert [efficient_test, pre_eval, format_only].count(True) <= 1, (
+        "``efficient_test``, ``pre_eval`` and ``format_only`` are mutually "
+        "exclusive, only one of them can be true."
+    )
+
+    model.eval()
+    results = []
+    dataset = data_loader.dataset.mmseg_dataset
+    prog_bar = mmcv.ProgressBar(len(dataset))
+    # How the data_loader retrieves samples from the dataset:
+    # sampler -> batch_sampler -> indices
+    # The indices are passed to dataset_fetcher to get data from dataset.
+    # data_fetcher -> collate_fn(dataset[index]) -> data_sample
+    # we use batch_sampler to get the correct sample indices
+    loader_indices = data_loader.batch_sampler
+
+    for batch_indices, data in zip(loader_indices, data_loader):
+        with torch.no_grad():
+            result = model(return_loss=False, **data)
+
+        if show or out_dir:
+            img_tensor = data["img"][0]
+            img_metas = data["img_metas"][0].data[0]
+            imgs = tensor2imgs(img_tensor, **img_metas[0]["img_norm_cfg"])
+            assert len(imgs) == len(img_metas)
+
+            for img, img_meta in zip(imgs, img_metas):
+                h, w, _ = img_meta["img_shape"]
+                img_show = img[:h, :w, :]
+
+                ori_h, ori_w = img_meta["ori_shape"][:-1]
+                img_show = mmcv.imresize(img_show, (ori_w, ori_h))
+
+                if out_dir:
+                    out_file = osp.join(out_dir, img_meta["ori_filename"])
+                else:
+                    out_file = None
+
+                model.module.show_result(
+                    img_show,
+                    result,
+                    palette=dataset.PALETTE,
+                    show=show,
+                    out_file=out_file,
+                    opacity=opacity,
+                )
+
+        if efficient_test:
+            result = [np2tmp(_, tmpdir=".efficient_test") for _ in result]
+
+        if format_only:
+            result = dataset.format_results(
+                result, indices=batch_indices, **format_args
+            )
+        if pre_eval:
+            # TODO: adapt samples_per_gpu > 1.
+            # only samples_per_gpu=1 valid now
+            result = dataset.pre_eval(result, indices=batch_indices)
+            results.extend(result)
+        else:
+            results.extend(result)
+
+        batch_size = len(result)
+        for _ in range(batch_size):
+            prog_bar.update()
+
+    return results
+
+
+def multi_gpu_test(
+    model,
+    data_loader,
+    tmpdir=None,
+    gpu_collect=False,
+    efficient_test=False,
+    pre_eval=False,
+    format_only=False,
+    format_args={},
+):
+    """Test model with multiple gpus by progressive mode.
+
+    This method tests the model with multiple gpus and collects the results
+    under two different modes: gpu and cpu. By setting 'gpu_collect=True'
+    it encodes results to gpu tensors and uses gpu communication for results
+    collection. In cpu mode it saves the results on different gpus to 'tmpdir'
+    and the rank 0 worker collects them.
+
+    Args:
+        model (nn.Module): Model to be tested.
+        data_loader (utils.data.Dataloader): Pytorch data loader.
+        tmpdir (str): Path of directory to save the temporary results from
+            different gpus under cpu mode. The same path is used for efficient
+            test. Default: None.
+        gpu_collect (bool): Option to use either gpu or cpu to collect results.
+            Default: False.
+        efficient_test (bool): Whether to save the results as local numpy
+            files to save CPU memory during evaluation. Mutually exclusive
+            with pre_eval and format_results. Default: False.
+        pre_eval (bool): Use dataset.pre_eval() function to generate
+            pre_results for metric evaluation. Mutually exclusive with
+            efficient_test and format_results. Default: False.
+        format_only (bool): Only format the results for submission without
+            performing evaluation. Mutually exclusive with pre_eval and
+            efficient_test. Default: False.
+        format_args (dict): The args for format_results. Default: {}.
+
+    Returns:
+        list: list of evaluation pre-results or list of save file names.
+    """
+    if efficient_test:
+        warnings.warn(
+            "DeprecationWarning: ``efficient_test`` will be deprecated, the "
+            "evaluation is CPU memory friendly with pre_eval=True"
+        )
+        mmcv.mkdir_or_exist(".efficient_test")
+    # when none of them is set true, return segmentation results as
+    # a list of np.array.
+    assert [efficient_test, pre_eval, format_only].count(True) <= 1, (
+        "``efficient_test``, ``pre_eval`` and ``format_only`` are mutually "
+        "exclusive, only one of them can be true."
+    )
+
+    model.eval()
+    results = []
+    dataset = data_loader.dataset.mmseg_dataset
+    # How the data_loader retrieves samples from the dataset:
+    # sampler -> batch_sampler -> indices
+    # The indices are passed to dataset_fetcher to get data from dataset.
+    # data_fetcher -> collate_fn(dataset[index]) -> data_sample
+    # we use batch_sampler to get the correct sample indices
+
+    # batch_sampler based on DistributedSampler, the indices only point to data
+    # samples of the related machine.
+    loader_indices = data_loader.batch_sampler
+
+    rank, world_size = get_dist_info()
+    if rank == 0:
+        prog_bar = mmcv.ProgressBar(len(dataset))
+
+    for batch_indices, data in zip(loader_indices, data_loader):
+        with torch.no_grad():
+            result = model(return_loss=False, rescale=True, **data)
+
+        if efficient_test:
+            result = [np2tmp(_, tmpdir=".efficient_test") for _ in result]
+
+        if format_only:
+            result = dataset.format_results(
+                result, indices=batch_indices, **format_args
+            )
+        if pre_eval:
+            # TODO: adapt samples_per_gpu > 1.
+            # only samples_per_gpu=1 valid now
+            result = dataset.pre_eval(result, indices=batch_indices)
+
+        results.extend(result)
+
+        if rank == 0:
+            batch_size = len(result) * world_size
+            for _ in range(batch_size):
+                prog_bar.update()
+
+    # collect results from all ranks
+    if gpu_collect:
+        results = collect_results_gpu(results, len(dataset))
+    else:
+        results = collect_results_cpu(results, len(dataset), tmpdir)
+    return results
diff --git a/python/deeplake/schemas.pyi b/python/deeplake/schemas.pyi
index 0e37e0c2c6..a18c5fc3ec 100644
--- a/python/deeplake/schemas.pyi
+++ b/python/deeplake/schemas.pyi
@@ -12,30 +12,39 @@ def TextEmbeddings(embedding_size: int, quantize: bool = False) -> SchemaTemplat
     """
     A schema for storing embedded text from documents.
 
-    - id (uint64)
-    - chunk_index (uint16) Position of the text_chunk within the document
-    - document_id (uint64) Unique identifier for the document the embedding came from
-    - date_created (uint64) Timestamp the document was read
-    - text_chunk (text) The text of the shard
-    - embedding (dtype=float32, size=embedding_size) The embedding of the text
+    This schema includes the following fields:
+    - id (uint64): Unique identifier for each entry.
+    - chunk_index (uint16): Position of the text chunk within the document.
+ - document_id (uint64): Unique identifier for the document the embedding came from. + - date_created (uint64): Timestamp when the document was read. + - text_chunk (text): The text of the shard. + - embedding (dtype=float32, size=embedding_size): The embedding of the text. Parameters: - embedding_size: Size of the embeddings - quantize: If true, quantize the embeddings to slightly decrease accuracy while greatly increasing query speed + embedding_size: int + Size of the embeddings. + quantize: bool, optional + If true, quantize the embeddings to slightly decrease accuracy while greatly increasing query speed. Default is False. Examples: + Create a dataset with the standard schema: ```python - # Create a dataset with the standard schema - ds = deeplake.create("ds_path", - schema=deeplake.schemas.TextEmbeddings(768).build()) - - # Customize the schema before creating the dataset - ds = deeplake.create("ds_path", schema=deeplake.schemas.TextEmbeddings(768) - .rename("embedding", "text_embed") - .add("author", types.Text()) - .build()) + ds = deeplake.create("tmp://", schema=deeplake.schemas.TextEmbeddings(768)) ``` + Customize the schema before creating the dataset: + ```python + ds = deeplake.create("tmp://", schema=deeplake.schemas.TextEmbeddings(768) + .rename("embedding", "text_embed") + .add("author", types.Text())) + ``` + + Add a new field to the schema: + ```python + schema = deeplake.schemas.TextEmbeddings(768) + schema.add("language", types.Text()) + ds = deeplake.create("tmp://", schema=schema) + ``` """ ... @@ -49,59 +58,101 @@ def COCOImages( """ A schema for storing COCO-based image data. - - id (uint64) - - image (jpg image) - - url (text) - - year (uint8) - - version (text) - - description (text) - - contributor (text) - - date_created (uint64) - - date_captured (uint64) - - embedding (embedding) - - license (text) - - is_crowd (bool) + This schema includes the following fields: + - id (uint64): Unique identifier for each entry. + - image (jpg image): The image data. + - url (text): URL of the image. + - year (uint8): Year the image was captured. + - version (text): Version of the dataset. + - description (text): Description of the image. + - contributor (text): Contributor of the image. + - date_created (uint64): Timestamp when the image was created. + - date_captured (uint64): Timestamp when the image was captured. + - embedding (embedding): Embedding of the image. + - license (text): License information. + - is_crowd (bool): Whether the image contains a crowd. If `objects` is true, the following fields are added: - - objects_bbox (bounding box) - - objects_classes (segment mask) + - objects_bbox (bounding box): Bounding boxes for objects. + - objects_classes (segment mask): Segment masks for objects. If `keypoints` is true, the following fields are added: - - keypoints_bbox (bounding box) - - keypoints_classes (segment mask) - - keypoints (2-dimensional array of uint32) - - keypoints_skeleton (2-dimensional array of uint16) + - keypoints_bbox (bounding box): Bounding boxes for keypoints. + - keypoints_classes (segment mask): Segment masks for keypoints. + - keypoints (2-dimensional array of uint32): Keypoints data. + - keypoints_skeleton (2-dimensional array of uint16): Skeleton data for keypoints. - if `stuffs` is true, the following fields are added: - - stuffs_bbox (bounding boxes) - - stuffs_classes (segment mask) + If `stuffs` is true, the following fields are added: + - stuffs_bbox (bounding boxes): Bounding boxes for stuffs. 
+    - stuffs_classes (segment mask): Segment masks for stuffs.
 
     Parameters:
-        embedding_size: Size of the embeddings
-        quantize: If true, quantize the embeddings to slightly decrease accuracy while greatly increasing query speed
+        embedding_size: int
+            Size of the embeddings.
+        quantize: bool, optional
+            If true, quantize the embeddings to slightly decrease accuracy while greatly increasing query speed. Default is False.
+        objects: bool, optional
+            Whether to include object-related fields. Default is True.
+        keypoints: bool, optional
+            Whether to include keypoint-related fields. Default is False.
+        stuffs: bool, optional
+            Whether to include stuff-related fields. Default is False.
 
     Examples:
+        Create a dataset with the standard schema:
         ```python
-        # Create a dataset with the standard schema
-        ds = deeplake.create("ds_path",
-            schema=deeplake.schemas.COCOImages(768).build())
+        ds = deeplake.create("tmp://", schema=deeplake.schemas.COCOImages(768))
+        ```
 
-        # Customize the schema before creating the dataset
-        ds = deeplake.create("ds_path", schema=deeplake.schemas.COCOImages(768,
-            objects=True, keypoints=True)
+        Customize the schema before creating the dataset:
+        ```python
+        ds = deeplake.create("tmp://", schema=deeplake.schemas.COCOImages(768, objects=True, keypoints=True)
             .rename("embedding", "image_embed")
-            .add("author", types.Text()).build())
+            .add("author", types.Text()))
         ```
 
+        Add a new field to the schema:
+        ```python
+        schema = deeplake.schemas.COCOImages(768)
+        schema.add("location", types.Text())
+        ds = deeplake.create("tmp://", schema=schema)
+        ```
     """
     ...
 
 class SchemaTemplate:
     """
-    A template that can be used for creating a new dataset with [deeplake.create][]
+    A template that can be used for creating a new dataset with [deeplake.create][].
+
+    This class allows you to define and customize the schema for your dataset.
+
+    Parameters:
+        schema: dict
+            A dictionary where the key is the column name and the value is the data type.
+
+    Methods:
+        add(name: str, dtype: deeplake._deeplake.types.DataType | str | deeplake._deeplake.types.Type) -> SchemaTemplate:
+            Adds a new column to the template.
+        remove(name: str) -> SchemaTemplate:
+            Removes a column from the template.
+        rename(old_name: str, new_name: str) -> SchemaTemplate:
+            Renames a column in the template.
+
+    Examples:
+        Create a new schema template, modify it, and create a dataset with the schema:
+        ```python
+        schema = deeplake.schemas.SchemaTemplate({
+            "id": types.UInt64(),
+            "text": types.Text(),
+            "embedding": types.Embedding(768)
+        })
+        schema.add("author", types.Text())
+        schema.remove("text")
+        schema.rename("embedding", "text_embedding")
+        ds = deeplake.create("tmp://", schema=schema)
+        ```
     """
 
-    # Temporary workaround. Need to remove `deeplake._deeplake` from the return type.
     def __init__(
         self,
         schema: dict[
@@ -109,9 +160,9 @@ class SchemaTemplate:
         ],
     ) -> None:
         """
-        Constructs a new SchemaTemplate from the given dict
+        Constructs a new SchemaTemplate from the given dict.
         """
         ...
 
     def add(
         self,
         name: str,
         dtype: deeplake._deeplake.types.DataType | str | deeplake._deeplake.types.Type,
     ) -> SchemaTemplate:
         """
-        Adds a new column to the template
+        Adds a new column to the template.
 
         Parameters:
-            name: The column name
-            dtype: The column data type
+            name: str
+                The column name.
+            dtype: deeplake._deeplake.types.DataType | str | deeplake._deeplake.types.Type
+                The column data type.
+
+        Returns:
+            SchemaTemplate: The updated schema template.
+ + Examples: + Add a new column to the schema: + ```python + schema = deeplake.schemas.SchemaTemplate({}) + schema.add("author", types.Text()) + ``` """ ... def remove(self, name: str) -> SchemaTemplate: """ - Removes a column from the template + Removes a column from the template. Parameters: - name: The column name + name: str + The column name. + + Returns: + SchemaTemplate: The updated schema template. + + Examples: + Remove a column from the schema: + ```python + schema = deeplake.schemas.SchemaTemplate({ + "id": types.UInt64(), + "text": types.Text(), + "embedding": types.Embedding(768) + }) + schema.remove("text") + ``` """ ... @@ -141,7 +219,23 @@ class SchemaTemplate: Renames a column in the template. Parameters: - old_name: Existing column name - new_name: New column name + old_name: str + Existing column name. + new_name: str + New column name. + + Returns: + SchemaTemplate: The updated schema template. + + Examples: + Rename a column in the schema: + ```python + schema = deeplake.schemas.SchemaTemplate({ + "id": types.UInt64(), + "text": types.Text(), + "embedding": types.Embedding(768) + }) + schema.rename("embedding", "text_embedding") + ``` """ ... diff --git a/python/deeplake/tql.pyi b/python/deeplake/tql.pyi index 97f0d1e43c..fdb09e88c1 100644 --- a/python/deeplake/tql.pyi +++ b/python/deeplake/tql.pyi @@ -16,6 +16,15 @@ def register_function(function: typing.Callable) -> None: TQL interacts with Python functions through `numpy.ndarray`. The Python function to be used in TQL should accept input arguments as numpy arrays and return numpy array. + + Examples: ```python def next_number(a): diff --git a/python/deeplake/types.pyi b/python/deeplake/types.pyi index 896b198fc9..a0603be9d8 100644 --- a/python/deeplake/types.pyi +++ b/python/deeplake/types.pyi @@ -306,13 +306,15 @@ def Array(dtype: DataType | str, dimensions: int, shape: list[int]) -> DataType: DataType: A new array data type with the specified parameters. Examples: - Create a three-dimensional array, where each dimension can have any number of elements:: - - ds.add_column("col1", types.Array("int32", dimensions=3)) + Create a three-dimensional array, where each dimension can have any number of elements: + ```python + ds.add_column("col1", types.Array("int32", dimensions=3)) + ``` - Create a three-dimensional array, where each dimension has a known size:: - - ds.add_column("col2", types.Array(types.Float32(), shape=[50, 30, 768])) + Create a three-dimensional array, where each dimension has a known size: + ```python + ds.add_column("col2", types.Array(types.Float32(), shape=[50, 30, 768])) + ``` """ ... @@ -324,10 +326,11 @@ def Bool() -> DataType: DataType: A new boolean data type. Examples: - Create columns with boolean type:: - - ds.add_column("col1", types.Bool) - ds.add_column("col2", "bool") + Create columns with boolean type: + ```python + ds.add_column("col1", types.Bool) + ds.add_column("col2", "bool") + ``` """ ... @@ -349,13 +352,14 @@ def Text(index_type: str | TextIndexType | None = None) -> Type: Type: A new text data type. 
Examples: - Create text columns with different configurations:: - - ds.add_column("col1", types.Text) - ds.add_column("col2", "text") - ds.add_column("col3", str) - ds.add_column("col4", types.Text(index_type=types.Inverted)) - ds.add_column("col4", types.Text(index_type=types.BM25)) + Create text columns with different configurations: + ```python + ds.add_column("col1", types.Text) + ds.add_column("col2", "text") + ds.add_column("col3", str) + ds.add_column("col4", types.Text(index_type=types.Inverted)) + ds.add_column("col5", types.Text(index_type=types.BM25)) + ``` """ ... @@ -387,11 +391,12 @@ def Dict() -> Type: :func:`deeplake.types.Struct` for a type that supports defining allowed keys. Examples: - Create and use a dictionary column:: - - ds.add_column("col1", types.Dict) - ds.append([{"col1": {"a": 1, "b": 2}}]) - ds.append([{"col1": {"b": 3, "c": 4}}]) + Create and use a dictionary column: + ```python + ds.add_column("col1", types.Dict) + ds.append([{"col1": {"a": 1, "b": 2}}]) + ds.append([{"col1": {"b": 3, "c": 4}}]) + ``` """ ... @@ -419,10 +424,11 @@ def Embedding( :func:`deeplake.types.Array` for a multidimensional array. Examples: - Create embedding columns:: - - ds.add_column("col1", types.Embedding(768)) - ds.add_column("col2", types.Embedding(768, quantization=types.QuantizationType.Binary)) + Create embedding columns: + ```python + ds.add_column("col1", types.Embedding(768)) + ds.add_column("col2", types.Embedding(768, quantization=types.QuantizationType.Binary)) + ``` """ ... @@ -434,9 +440,10 @@ def Float32() -> DataType: DataType: A new 32-bit float data type. Examples: - Create a column with 32-bit float type:: - - ds.add_column("col1", types.Float32) + Create a column with 32-bit float type: + ```python + ds.add_column("col1", types.Float32) + ``` """ ... @@ -448,9 +455,10 @@ def Float64() -> DataType: DataType: A new 64-bit float data type. Examples: - Create a column with 64-bit float type:: - - ds.add_column("col1", types.Float64) + Create a column with 64-bit float type: + ```python + ds.add_column("col1", types.Float64) + ``` """ ... @@ -462,9 +470,10 @@ def Int16() -> DataType: DataType: A new 16-bit integer data type. Examples: - Create a column with 16-bit integer type:: - - ds.add_column("col1", types.Int16) + Create a column with 16-bit integer type: + ```python + ds.add_column("col1", types.Int16) + ``` """ ... @@ -476,9 +485,10 @@ def Int32() -> DataType: DataType: A new 32-bit integer data type. Examples: - Create a column with 32-bit integer type:: - - ds.add_column("col1", types.Int32) + Create a column with 32-bit integer type: + ```python + ds.add_column("col1", types.Int32) + ``` """ ... @@ -490,9 +500,10 @@ def Int64() -> DataType: DataType: A new 64-bit integer data type. Examples: - Create a column with 64-bit integer type:: - - ds.add_column("col1", types.Int64) + Create a column with 64-bit integer type: + ```python + ds.add_column("col1", types.Int64) + ``` """ ... @@ -504,9 +515,10 @@ def Int8() -> DataType: DataType: A new 8-bit integer data type. Examples: - Create a column with 8-bit integer type:: - - ds.add_column("col1", types.Int8) + Create a column with 8-bit integer type: + ```python + ds.add_column("col1", types.Int8) + ``` """ ... @@ -526,9 +538,10 @@ def Sequence(nested_type: DataType | str | Type) -> Type: Type: A new sequence data type. 
Examples: - Create a sequence of images:: - - ds.add_column("col1", types.Sequence(types.Image(sample_ + Create a sequence of images: + ```python + ds.add_column("col1", types.Sequence(types.Image(sample_compression="jpg"))) + ``` """ def Image(dtype: DataType | str = "uint8", sample_compression: str = "png") -> Type: @@ -554,7 +567,7 @@ def Image(dtype: DataType | str = "uint8", sample_compression: str = "png") -> T Examples: ```python ds.add_column("col1", types.Image) - ds.add_column("col1", types.Image(sample_compression="jpg")) + ds.add_column("col2", types.Image(sample_compression="jpg")) ``` """ ... @@ -615,7 +628,7 @@ def BinaryMask( Examples: ```python ds.add_column("col1", types.BinaryMask(sample_compression="lz4")) - ds.append(np.zeros((512, 512, 5), dtype="bool")) + ds.append([{"col1": np.zeros((512, 512, 5), dtype="bool")}]) ``` """ ... @@ -637,7 +650,7 @@ def SegmentMask( Examples: ```python ds.add_column("col1", types.SegmentMask(sample_compression="lz4")) - ds.append("col1", np.zeros((512, 512))) + ds.append([{"col1": np.zeros((512, 512, 3))}]) ``` """ ... @@ -655,14 +668,12 @@ def Struct(fields: dict[str, DataType | str]) -> DataType: ```python ds.add_column("col1", types.Struct({ "field1": types.Int16(), - "field2": types.Text(), + "field2": "text", })) ds.append([{"col1": {"field1": 3, "field2": "a"}}]) print(ds[0]["col1"]["field1"]) # Output: 3 ``` - - """ ...