Skip to content

Commit

Permalink
Merge pull request #12 from ML-Dev-Hub/cli_python
Browse files Browse the repository at this point in the history
Cli python
  • Loading branch information
saeedahmadicp authored Mar 24, 2023
2 parents 4718c4c + 41efe8e commit 844f169
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 46 deletions.
5 changes: 0 additions & 5 deletions __main__.py

This file was deleted.

17 changes: 0 additions & 17 deletions cli.py

This file was deleted.

28 changes: 28 additions & 0 deletions csv_trans/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from argparse import ArgumentParser
from csv_trans.translate import translate

def parse_arguments_from_cli(parser : ArgumentParser):
    """
    Register the csv_trans command line options on the given parser
    ----------------
    Parameters:
    parser: ArgumentParser
    The parser to add the command line options to
    ----------------
    Returns:
    parser: ArgumentParser
    The same parser instance, with the options registered.
    Nothing is parsed here; the caller invokes ``parser.parse_args()``.
    """
    # Option names (and therefore the attribute names on the parsed
    # namespace: file_path, file_separator, source_language,
    # target_language) are part of the CLI contract and stay unchanged.
    parser.add_argument(
        '-f', '--file_path', type=str, required=True,
        help='path to the CSV file to translate',
    )
    # An option that carries a default is already optional, so no
    # explicit required=False is needed.
    parser.add_argument(
        '-fs', '--file_separator', type=str, default=',',
        help="column separator used in the CSV file (default: ',')",
    )
    parser.add_argument(
        '-sl', '--source_language', type=str, required=True,
        help='language code of the text in the input file',
    )
    parser.add_argument(
        '-tl', '--target_language', type=str, required=True,
        help='language code to translate the text into',
    )
    return parser

def main(argv=None):
    """
    Entry point of the csv_trans command line interface
    ----------------
    Parameters:
    argv: list of str, optional
    The command line arguments to parse. When None (the default, and
    what the console script uses) argparse reads them from sys.argv.
    ----------------
    Returns:
    None
    """
    parser = parse_arguments_from_cli(ArgumentParser())
    # Passing argv through lets callers drive the CLI programmatically;
    # argparse falls back to sys.argv[1:] when it is None.
    args = parser.parse_args(argv)
    translate(
        args.file_path,
        args.source_language,
        args.target_language,
        args.file_separator,
    )
17 changes: 16 additions & 1 deletion csv_trans/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,22 @@
def translate(file: str, source_lang: str, target_lang: str, sep: str = ',') -> None:
"""
Translates the data in the file to the target language and saves the result
----------------
Parameters:
file: str
The path to the file
source_lang: str
The source language of the data
target_lang: str
The target language to translate the data to
sep: str
The separator of the file
----------------
Returns:
None
"""

encoding_scheme = detect_encoding_scheme(file)
# read the data from the csv file
data = read_csv_file(file, encoding_scheme, sep)
Expand All @@ -27,7 +42,7 @@ def translate(file: str, source_lang: str, target_lang: str, sep: str = ',') ->
pbar.update()

# save the data to the csv file
save_csv_file(data, file, encoding_scheme)
save_csv_file(data, file, encoding_scheme, target_lang)


# make file_separator optional
Expand Down
112 changes: 90 additions & 22 deletions csv_trans/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,18 @@


def detect_encoding_scheme(file_path):
"""Detect encoding scheme of a CSV file"""
"""
Detect encoding scheme of a CSV file
--------------------
Parameters:
file_path: str
The path to the file
--------------------
Returns:
encoding_scheme: str
The encoding scheme of the file
"""
try:
with open(file_path, 'rb') as f:
rawdata = f.read(200)
Expand All @@ -28,6 +39,15 @@ def detect_encoding_scheme(file_path):
def validate_dataframe(df):
"""
Check if the data is a valid dataframe and not empty.
--------------------
Parameters:
df: pandas.DataFrame
The dataframe to be validated
--------------------
Returns:
bool
True if the dataframe is valid and not empty, False otherwise
"""
if isinstance(df, pd.DataFrame) and not df.empty:
return True
Expand All @@ -38,13 +58,23 @@ def translate_text(texts, target_language, source_language='en', chunk_size=4000
"""
Translate the text into the target language using Google Translator API
:param texts: List of texts to be translated
:param source_language: The language of the input text
:param target_language: The target language to translate the text to
:param chunk_size: The size of chunk to split the input text
:param timeout: Timeout length for the request
:return:
--------------------
Parameters:
texts: list
List of texts to be translated
source_language: str
The language of the input text
target_language: str
The target language to translate the text to
chunk_size: int
The size of chunk to split the input text
timeout: int
Timeout length for the request
--------------------
Returns:
translations: list
The list of translated texts
"""
translations = []

Expand Down Expand Up @@ -83,9 +113,17 @@ def translate_text(texts, target_language, source_language='en', chunk_size=4000
def split_text_data(text, chunk_size):
"""
Split the input data/text into a fixed chunk size
:param text: the input data to be split
:param chunk_size: The chunk size to split the data
----------------
Parameters:
text: str
The input data to be split
chunk_size: int
The chunk size to split the data
----------------
Returns:
chunks: list
The list of chunks
"""
chunks = []
start = 0
Expand All @@ -112,6 +150,19 @@ def split_text_data(text, chunk_size):
def translate_dataframe(df, source_language, target_language):
"""
Translate a given pandas DataFrame to a desired language
----------------
Parameters:
df: pandas.DataFrame
The dataframe to be translated
source_language: str
The language of the input text
target_language: str
The target language to translate the text to
----------------
Returns:
result_df: pandas.DataFrame
The translated dataframe
"""
# Determine the number of threads to use based on the number of available CPU cores
num_threads = min(cpu_count(), len(df.columns))
Expand All @@ -132,11 +183,21 @@ def translate_dataframe(df, source_language, target_language):
def read_csv_file(file_path, encoding_scheme, separator=','):
"""
Read a CSV file using the given encoding scheme and delimiter
:param file_path: the path to the input file
:param encoding_scheme: the encoding to use when reading the file
:param separator: the delimiter to use when reading the CSV file
----------------
Parameters:
file_path: str
The path to the input file
encoding_scheme: str
The encoding to use when reading the file
separator: str
The delimiter to use when reading the CSV file
----------------
Returns:
data: pandas.DataFrame
The DataFrame containing the data from the CSV file
"""

try:
data = pd.read_csv(file_path, encoding=encoding_scheme, sep=separator, engine='pyarrow')
return data
Expand All @@ -145,20 +206,27 @@ def read_csv_file(file_path, encoding_scheme, separator=','):
return None


def save_csv_file(df, file_path, encoding_scheme, target_language):
    """
    Save a pandas DataFrame to a CSV file next to the input file
    ----------------
    Parameters:
    df: pandas.DataFrame
    The DataFrame to be saved as CSV
    file_path: str
    The path of the input file; the output is written to the same
    directory as "translated_<target_language>_<input file name>"
    encoding_scheme: str
    The encoding scheme to use when saving the CSV file
    target_language: str
    The target language of the translated text
    ----------------
    Returns:
    None
    """
    directory, file_name = os.path.split(file_path)
    # Build the destination once so both write attempts target the same file.
    output_path = os.path.join(
        directory, f"translated_{target_language}_{file_name}"
    )
    try:
        try:
            df.to_csv(output_path, encoding=encoding_scheme, index=False)
        except UnicodeEncodeError:
            # The translated text may not be representable in the
            # encoding of the input file; fall back to UTF-8.
            df.to_csv(output_path, encoding='utf-8', index=False)
    except Exception as e:
        # Best-effort save: report the failure instead of raising. The
        # outer handler also covers errors from the UTF-8 fallback write.
        print(f"Error saving file {file_name}: {e}")
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
],
entry_points={
"console_scripts": [
"csv_trans = csv_trans.__main__:cli"
"csv_trans = csv_trans.cli:main"
]
},
keywords=['python', 'csv', 'translate', 'translator', 'google',
Expand Down

0 comments on commit 844f169

Please sign in to comment.