Skip to content

Commit

Permalink
Merge pull request #5 from sumeshi/feature/v0.3.3
Browse files Browse the repository at this point in the history
Feature/v0.3.3
  • Loading branch information
sumeshi authored Nov 6, 2024
2 parents 3f11623 + 3125dcb commit cadd278
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 25 deletions.
63 changes: 59 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ shape: (3, 5)
Loads the specified CSV files.
```
Arguments:
path*: str
*path: tuple[str]
```

examples
Expand Down Expand Up @@ -102,13 +102,46 @@ Filter rows containing the specified regex.
Arguments:
colname: str
regex: str
ignorecase: bool = False
```

examples
```
$ qsv load ./Security.csv - contains 'Date and Time' '10/6/2016'
```

#### sed
Replaces values that match the specified regex.

```
Arguments:
colname: str
regex: str
replaced_text: str
ignorecase: bool = False
```

examples
```
$ qsv load ./Security.csv - sed 'Date and Time' '/' '-'
```

#### grep
Treats all columns as strings and filters only the rows in which any column matches the specified regex.

This function is similar to running the grep command, but it preserves the header row.

```
Arguments:
regex: str
ignorecase: bool = False
```

examples
```
$ qsv load ./Security.csv - grep 'LogonType'
```

#### head
Filters only the specified number of lines from the first line.

Expand Down Expand Up @@ -140,7 +173,7 @@ Sorts all rows by the specified column values.

```
Arguments:
colnames: Union[str, tuple[str]]
colnames: Union[str, tuple[str], list[str]]
Options:
desc: bool = False
Expand All @@ -156,7 +189,7 @@ Remove duplicated rows by the specified column names.

```
Arguments:
colnames: Union[str, list[str]]
colnames: Union[str, tuple[str], list[str]]
```

examples
Expand All @@ -167,6 +200,8 @@ $ qsv load ./Security.csv - uniq 'Event ID'
#### changetz
Changes the timezone of the specified date column.

The datetime format uses the same directives as [Python's datetime module](https://docs.python.org/ja/3/library/datetime.html) (based on the 1989 C standard).

```
Arguments:
colname: str
Expand All @@ -179,7 +214,21 @@ Options:

examples
```
$ qsv load ./Security.csv - changetz 'Date and Time' --timezone_from=UTC --timezone_to=Asia/Tokyo --new_colname='Date and Time(JST)'
$ qsv load ./Security.csv - changetz 'Date and Time' --timezone_from=UTC --timezone_to=Asia/Tokyo --datetime_format="%m/%d/%Y %I:%M:%S %p"
```

#### renamecol
Renames the specified column to a new name.

```
Arguments:
colname: str
new_colname: str
```

examples
```
$ qsv load ./Security.csv - renamecol 'Event ID' 'EventID'
```

### Finalizer
Expand Down Expand Up @@ -295,6 +344,12 @@ e.g
$ qsv quilt rules ./Security.csv
```

```
Arguments:
config: str
*path: tuple[str]
```

rules/test.yaml
```yaml
title: test
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "qsv"
version = "0.3.2"
version = "0.3.3"
description = "A tool designed for rapid CSV file processing and filtering, specifically designed for log analysis."
readme = "README.md"
authors = [
Expand Down
84 changes: 65 additions & 19 deletions src/qsv/controllers/DataFrameController.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re
import sys
import logging
from datetime import datetime
from typing import Union
from datetime import datetime
from pathlib import Path

from qsv.controllers.CsvController import CsvController
from qsv.controllers.QuiltController import QuiltController
Expand All @@ -18,9 +19,23 @@
class DataFrameController(object):
def __init__(self):
self.df = None

# -- private methods --
def __check_exists_path(self, path: tuple[str]) -> None:
for p in path:
if not Path(p).exists():
print(f"[Error] File \"{p}\" does not exist.")
sys.exit(1)

def __check_exists_colnames(self, colnames: list[str]) -> None:
columns = self.df.collect_schema().names()
for colname in colnames:
if colname not in columns:
print(f"[Error] Column \"{colname}\" does not exist in the dataframe.")
sys.exit(1)

# -- quilter --
def quilt(self, config: str, *path: str):
def quilt(self, config: str, *path: tuple[str]) -> None:
"""[quilter] Loads the specified quilt batch files."""
logger.debug(f"config: {config}")
logger.debug(f"{len(path)} files are loaded. [{', '.join(path)}]")
Expand All @@ -38,9 +53,10 @@ def quilt(self, config: str, *path: str):
getattr(self, k)()

# -- initializer --
def load(self, *path: str):
def load(self, *path: tuple[str]):
"""[initializer] Loads the specified CSV files."""
logger.debug(f"{len(path)} files are loaded. [{', '.join(path)}]")
self.__check_exists_path(path)
self.df = CsvController(path=path).get_dataframe()
return self

Expand Down Expand Up @@ -71,31 +87,51 @@ def parse_columns(headers: list[str], colnames: tuple[str]):
colnames = tuple(colnames)
elif type(colnames) is str:
colnames = (colnames, )

self.__check_exists_colnames(colnames)
selected_columns = parse_columns(headers=self.df.collect_schema().names(), colnames=colnames)

logger.debug(f"{len(selected_columns)} columns are selected. {', '.join(selected_columns)}")
self.df = self.df.select(selected_columns)
return self

def isin(self, colname: str, values: list):
"""[chainable] Filter rows containing the specified values."""
logger.debug(f"filter condition: {values} in {colname}")
self.__check_exists_colnames([colname])
self.df = self.df.filter(pl.col(colname).is_in(values))
return self

def contains(self, colname: str, regex: str):
def contains(self, colname: str, regex: str, ignorecase: bool = False):
"""[chainable] Filter rows containing the specified regex."""
logger.debug(f"filter condition: {regex} contains {colname}")
self.__check_exists_colnames([colname])
regex = regex if type(regex) is str else str(regex)
self.df = self.df.filter(pl.col(colname).str.contains(regex))
self.df = self.df.filter(
pl.col(colname).str.contains(f"(?i){regex}") if ignorecase else pl.col(colname).str.contains(regex)
)
return self

def sed(self, colname: str, regex: str, replaced_text: str):
def sed(self, colname: str, regex: str, replaced_text: str, ignorecase: bool = False):
"""[chainable] Replace values by specified regex."""
logger.debug(f"sed condition: {regex} on {colname}")
self.__check_exists_colnames([colname])
regex = regex if type(regex) is str else str(regex)
self.df = self.df.with_columns(pl.col(colname).cast(pl.String).str.replace(regex, replaced_text))
self.df = self.df.with_columns(
pl.col(colname).cast(pl.String).str.replace(f"(?i){regex}", replaced_text) if ignorecase else pl.col(colname).cast(pl.String).str.replace(regex, replaced_text)
)
return self

def grep(self, regex: str, ignorecase: bool = False):
"""[chainable] Treats all cols as strings and filters only matched cols by searching with the specified regex"""
self.df = self.df.with_columns(
pl.concat_str(
[pl.col(colname).cast(pl.String).fill_null("") for colname in self.df.collect_schema().names()],
separator=","
).alias('___combined')
)
self.df = self.df.filter(
pl.col('___combined').str.contains(f"(?i){regex}") if ignorecase else pl.col('___combined').str.contains(regex)
)
self.df = self.df.drop(['___combined'])
return self

def head(self, number: int = 5):
Expand All @@ -110,9 +146,16 @@ def tail(self, number: int = 5):
self.df = self.df.tail(number)
return self

def sort(self, colnames: str, desc: bool = False):
def sort(self, colnames: Union[str, tuple[str], list[str]], desc: bool = False):
"""[chainable] Sorts all rows by the specified column values."""
logger.debug(f"sort by {colnames} ({'desc' if desc else 'asc'}).")
# prevent type guessing
colnames: tuple[str]
if type(colnames) is list:
colnames = tuple(colnames)
elif type(colnames) is str:
colnames = (colnames, )
self.__check_exists_colnames(colnames)
self.df = self.df.sort(colnames, descending=desc)
return self

Expand All @@ -125,7 +168,7 @@ def uniq(self, colnames: Union[str, tuple[str], list[str]]):
colnames = tuple(colnames)
elif type(colnames) is str:
colnames = (colnames, )

self.__check_exists_colnames(colnames)
self.df = self.df.unique(subset=colnames)
return self

Expand All @@ -138,6 +181,7 @@ def changetz(
):
"""[chainable] Changes the timezone of the specified date column."""
logger.debug(f"change {colname} timezone {timezone_from} to {timezone_to}.")
self.__check_exists_colnames([colname])

if datetime_format:
self.df = self.df.with_columns(pl.col(colname).str.to_datetime(datetime_format))
Expand All @@ -150,37 +194,39 @@ def changetz(

def renamecol(self, colname: str, new_colname: str):
"""[chainable] Rename specified column name."""
self.__check_exists_colnames([colname])
self.df = self.df.rename({colname: new_colname})
return self

# -- finalizer --
def headers(self, plain: bool = False):
def headers(self, plain: bool = False) -> None:
"""[finalizer] Displays the column names of the data."""
if plain:
print(",".join([f"\"{c}\"" for c in self.df.columns]))
print(",".join([f"\"{c}\"" for c in self.df.collect_schema().names()]))
else:
digits = len(str(len(self.df.collect_schema().names())))
TableView.print(
headers=["#", "Column Name"],
values=[[str(i).zfill(2), c] for i, c in enumerate(self.df.columns)]
values=[[str(i).zfill(digits), c] for i, c in enumerate(self.df.collect_schema().names())]
)

def stats(self):
def stats(self) -> None:
"""[finalizer] Displays the statistical information of the data."""
print(self.df.describe())

def showquery(self):
def showquery(self) -> None:
"""[finalizer] Displays the data processing query."""
print(self.df)

def show(self):
def show(self) -> None:
"""[finalizer] Outputs the processing results to the standard output."""
self.df.collect().write_csv(sys.stdout)

def showtable(self):
def showtable(self) -> None:
"""[finalizer] Outputs the processing results table to the standard output."""
print(self.df.collect())

def dump(self, path: str = None):
def dump(self, path: str = None) -> None:
"""[finalizer] Outputs the processing results to a CSV file."""
def autoname():
now = datetime.now().strftime('%Y%m%d-%H%M%S')
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit cadd278

Please sign in to comment.