Skip to content

Commit

Permalink
Merge pull request #5 from sumeshi/feature/v0.3.3
Browse files Browse the repository at this point in the history
Feature/v0.3.3
  • Loading branch information
sumeshi authored Nov 6, 2024
2 parents 3f11623 + 3125dcb commit cadd278
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 25 deletions.
63 changes: 59 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ shape: (3, 5)
Loads the specified CSV files.
```
Arguments:
path*: str
*path: tuple[str]
```

examples
Expand Down Expand Up @@ -102,13 +102,46 @@ Filter rows containing the specified regex.
Arguments:
colname: str
regex: str
ignorecase: bool = False
```

examples
```
$ qsv load ./Security.csv - contains 'Date and Time' '10/6/2016'
```

#### sed
Replaces values that match the specified regex.

```
Arguments:
colname: str
regex: str
replaced_text: str
ignorecase: bool = False
```

examples
```
$ qsv load ./Security.csv - sed 'Date and Time' '/' '-'
```

#### grep
Treats all columns as strings and filters only the rows in which any column matches the specified regex.

This function is similar to running the grep command, but it preserves the header row.

```
Arguments:
regex: str
ignorecase: bool = False
```

examples
```
$ qsv load ./Security.csv - grep 'LogonType'
```

#### head
Filters only the specified number of lines from the first line.

Expand Down Expand Up @@ -140,7 +173,7 @@ Sorts all rows by the specified column values.

```
Arguments:
colnames: Union[str, tuple[str]]
colnames: Union[str, tuple[str], list[str]]
Options:
desc: bool = False
Expand All @@ -156,7 +189,7 @@ Remove duplicated rows by the specified column names.

```
Arguments:
colnames: Union[str, list[str]]
colnames: Union[str, tuple[str], list[str]]
```

examples
Expand All @@ -167,6 +200,8 @@ $ qsv load ./Security.csv - uniq 'Event ID'
#### changetz
Changes the timezone of the specified date column.

The datetime format uses the same directives as [Python's datetime module](https://docs.python.org/ja/3/library/datetime.html) (based on the 1989 C standard).

```
Arguments:
colname: str
Expand All @@ -179,7 +214,21 @@ Options:

examples
```
$ qsv load ./Security.csv - changetz 'Date and Time' --timezone_from=UTC --timezone_to=Asia/Tokyo --new_colname='Date and Time(JST)'
$ qsv load ./Security.csv - changetz 'Date and Time' --timezone_from=UTC --timezone_to=Asia/Tokyo --datetime_format="%m/%d/%Y %I:%M:%S %p"
```

#### renamecol
Renames the specified column to a new name.

```
Arguments:
colname: str
new_colname: str
```

examples
```
$ qsv load ./Security.csv - renamecol 'Event ID' 'EventID'
```

### Finalizer
Expand Down Expand Up @@ -295,6 +344,12 @@ e.g
$ qsv quilt rules ./Security.csv
```

```
Arguments:
config: str
*path: tuple[str]
```

rules/test.yaml
```yaml
title: test
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "qsv"
version = "0.3.2"
version = "0.3.3"
description = "A tool designed for rapid CSV file processing and filtering, specifically designed for log analysis."
readme = "README.md"
authors = [
Expand Down
84 changes: 65 additions & 19 deletions src/qsv/controllers/DataFrameController.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re
import sys
import logging
from datetime import datetime
from typing import Union
from datetime import datetime
from pathlib import Path

from qsv.controllers.CsvController import CsvController
from qsv.controllers.QuiltController import QuiltController
Expand All @@ -18,9 +19,23 @@
class DataFrameController(object):
def __init__(self):
self.df = None

# -- private methods --
def __check_exists_path(self, path: tuple[str]) -> None:
for p in path:
if not Path(p).exists():
print(f"[Error] File \"{p}\" does not exist.")
sys.exit(1)

def __check_exists_colnames(self, colnames: list[str]) -> None:
columns = self.df.collect_schema().names()
for colname in colnames:
if colname not in columns:
print(f"[Error] Column \"{colname}\" does not exist in the dataframe.")
sys.exit(1)

# -- quilter --
def quilt(self, config: str, *path: str):
def quilt(self, config: str, *path: tuple[str]) -> None:
"""[quilter] Loads the specified quilt batch files."""
logger.debug(f"config: {config}")
logger.debug(f"{len(path)} files are loaded. [{', '.join(path)}]")
Expand All @@ -38,9 +53,10 @@ def quilt(self, config: str, *path: str):
getattr(self, k)()

# -- initializer --
def load(self, *path: str):
def load(self, *path: tuple[str]):
"""[initializer] Loads the specified CSV files."""
logger.debug(f"{len(path)} files are loaded. [{', '.join(path)}]")
self.__check_exists_path(path)
self.df = CsvController(path=path).get_dataframe()
return self

Expand Down Expand Up @@ -71,31 +87,51 @@ def parse_columns(headers: list[str], colnames: tuple[str]):
colnames = tuple(colnames)
elif type(colnames) is str:
colnames = (colnames, )

self.__check_exists_colnames(colnames)
selected_columns = parse_columns(headers=self.df.collect_schema().names(), colnames=colnames)

logger.debug(f"{len(selected_columns)} columns are selected. {', '.join(selected_columns)}")
self.df = self.df.select(selected_columns)
return self

def isin(self, colname: str, values: list):
"""[chainable] Filter rows containing the specified values."""
logger.debug(f"filter condition: {values} in {colname}")
self.__check_exists_colnames([colname])
self.df = self.df.filter(pl.col(colname).is_in(values))
return self

def contains(self, colname: str, regex: str):
def contains(self, colname: str, regex: str, ignorecase: bool = False):
"""[chainable] Filter rows containing the specified regex."""
logger.debug(f"filter condition: {regex} contains {colname}")
self.__check_exists_colnames([colname])
regex = regex if type(regex) is str else str(regex)
self.df = self.df.filter(pl.col(colname).str.contains(regex))
self.df = self.df.filter(
pl.col(colname).str.contains(f"(?i){regex}") if ignorecase else pl.col(colname).str.contains(regex)
)
return self

def sed(self, colname: str, regex: str, replaced_text: str):
def sed(self, colname: str, regex: str, replaced_text: str, ignorecase: bool = False):
"""[chainable] Replace values by specified regex."""
logger.debug(f"sed condition: {regex} on {colname}")
self.__check_exists_colnames([colname])
regex = regex if type(regex) is str else str(regex)
self.df = self.df.with_columns(pl.col(colname).cast(pl.String).str.replace(regex, replaced_text))
self.df = self.df.with_columns(
pl.col(colname).cast(pl.String).str.replace(f"(?i){regex}", replaced_text) if ignorecase else pl.col(colname).cast(pl.String).str.replace(regex, replaced_text)
)
return self

def grep(self, regex: str, ignorecase: bool = False):
"""[chainable] Treats all cols as strings and filters only matched cols by searching with the specified regex"""
self.df = self.df.with_columns(
pl.concat_str(
[pl.col(colname).cast(pl.String).fill_null("") for colname in self.df.collect_schema().names()],
separator=","
).alias('___combined')
)
self.df = self.df.filter(
pl.col('___combined').str.contains(f"(?i){regex}") if ignorecase else pl.col('___combined').str.contains(regex)
)
self.df = self.df.drop(['___combined'])
return self

def head(self, number: int = 5):
Expand All @@ -110,9 +146,16 @@ def tail(self, number: int = 5):
self.df = self.df.tail(number)
return self

def sort(self, colnames: str, desc: bool = False):
def sort(self, colnames: Union[str, tuple[str], list[str]], desc: bool = False):
"""[chainable] Sorts all rows by the specified column values."""
logger.debug(f"sort by {colnames} ({'desc' if desc else 'asc'}).")
# prevent type guessing
colnames: tuple[str]
if type(colnames) is list:
colnames = tuple(colnames)
elif type(colnames) is str:
colnames = (colnames, )
self.__check_exists_colnames(colnames)
self.df = self.df.sort(colnames, descending=desc)
return self

Expand All @@ -125,7 +168,7 @@ def uniq(self, colnames: Union[str, tuple[str], list[str]]):
colnames = tuple(colnames)
elif type(colnames) is str:
colnames = (colnames, )

self.__check_exists_colnames(colnames)
self.df = self.df.unique(subset=colnames)
return self

Expand All @@ -138,6 +181,7 @@ def changetz(
):
"""[chainable] Changes the timezone of the specified date column."""
logger.debug(f"change {colname} timezone {timezone_from} to {timezone_to}.")
self.__check_exists_colnames([colname])

if datetime_format:
self.df = self.df.with_columns(pl.col(colname).str.to_datetime(datetime_format))
Expand All @@ -150,37 +194,39 @@ def changetz(

def renamecol(self, colname: str, new_colname: str):
"""[chainable] Rename specified column name."""
self.__check_exists_colnames([colname])
self.df = self.df.rename({colname: new_colname})
return self

# -- finalizer --
def headers(self, plain: bool = False):
def headers(self, plain: bool = False) -> None:
"""[finalizer] Displays the column names of the data."""
if plain:
print(",".join([f"\"{c}\"" for c in self.df.columns]))
print(",".join([f"\"{c}\"" for c in self.df.collect_schema().names()]))
else:
digits = len(str(len(self.df.collect_schema().names())))
TableView.print(
headers=["#", "Column Name"],
values=[[str(i).zfill(2), c] for i, c in enumerate(self.df.columns)]
values=[[str(i).zfill(digits), c] for i, c in enumerate(self.df.collect_schema().names())]
)

def stats(self):
def stats(self) -> None:
"""[finalizer] Displays the statistical information of the data."""
print(self.df.describe())

def showquery(self):
def showquery(self) -> None:
"""[finalizer] Displays the data processing query."""
print(self.df)

def show(self):
def show(self) -> None:
"""[finalizer] Outputs the processing results to the standard output."""
self.df.collect().write_csv(sys.stdout)

def showtable(self):
def showtable(self) -> None:
"""[finalizer] Outputs the processing results table to the standard output."""
print(self.df.collect())

def dump(self, path: str = None):
def dump(self, path: str = None) -> None:
"""[finalizer] Outputs the processing results to a CSV file."""
def autoname():
now = datetime.now().strftime('%Y%m%d-%H%M%S')
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit cadd278

Please sign in to comment.