Skip to content

Commit

Permalink
Merge pull request #12 from ErykDarnowski/feat/9/add-csv-output-mode
Browse files Browse the repository at this point in the history
  • Loading branch information
ErykDarnowski authored Jan 23, 2024
2 parents 822de19 + 951965c commit fdb45f3
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# I/O files:
transcript.txt
output.jsonl
output.*

# ---

Expand Down
78 changes: 61 additions & 17 deletions __main__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/usr/bin/env python3

import re
import csv
import json
import argparse

__author__ = "Eryk Darnowski"
__version__ = "0.0.0"
Expand All @@ -12,12 +14,20 @@ def is_sender(string):
return string[0] == " "

def main():
input_filename = "transcript.txt"
output_filename = "output.jsonl"
parser = argparse.ArgumentParser(description="")
parser.add_argument("-i", default="transcript.txt", help="Choose input filename")
parser.add_argument("-o", default="output", help="Choose output filename - no extension")
parser.add_argument("-f", choices=["csv", "json"], default="json", help="Choose output format (csv/json)")
args = parser.parse_args()

output_format = args.f
output_filename = args.o
input_filename = args.i
config = {
"role": "system",
"content": "Jestem Marek, chat bot bazowany na wiadomościach z Messengera, który ma na celu imitować chat z tą osobą.",
}

regex_patterns = [
"http(s)?:\/\/",
"(Przekazano wiadomość|przekazuje wiadomość)",
Expand All @@ -41,33 +51,67 @@ def main():
transcript = [line for line in transcript if not any(re.search(pattern, line) for pattern in regex_patterns)]


if (output_format == "csv"):
config = config["content"]

# split by sender / receiver + format
convo_list = [];
convo = { "messages": [ config ] }
convo_list = []
convo = [config] if (output_format == "csv") else { "messages": [ config ] }



# go through each line
for i in range(len(transcript) - 1):
curr_is_sender = is_sender(transcript[i])

# must come first to keep the "prompts first, answers later" schema
if curr_is_sender:
convo["messages"].append({ "role": "user", "content": transcript[i].lstrip() })
if (output_format == "csv"):
# must come first to keep the "prompts first, answers later" schema
if curr_is_sender:
if (len(convo) == 1):
convo.append(transcript[i].lstrip())
else:
convo[-1] += '\n' + transcript[i].lstrip()
else:
if (len(convo) > 1):
if (len(convo) == 2):
convo.append(transcript[i])
else:
convo[-1] += '\n' + transcript[i]

# making sure that the convo won't start with an answer to a nonexistent prompt
if (not curr_is_sender and is_sender(transcript[i + 1])):
if (len(convo) > 1):
convo_list.append(convo)
convo = [config]
else:
if (len(convo["messages"]) > 1):
convo["messages"].append({ "role": "assistant", "content": transcript[i] })
if curr_is_sender:
convo["messages"].append({ "role": "user", "content": transcript[i].lstrip() })
else:
if (len(convo["messages"]) > 1):
convo["messages"].append({ "role": "assistant", "content": transcript[i] })

# making sure that the convo won't start with an answer to a nonexistent prompt
if (not curr_is_sender and is_sender(transcript[i + 1])):
if (len(convo["messages"]) > 1):
convo_list.append(convo)
convo = { "messages": [ config ] }

# making sure that the convo won't start with an answer to a nonexistent prompt
if (not curr_is_sender and is_sender(transcript[i + 1])):
if (len(convo["messages"]) > 1):
convo_list.append(convo)
convo = { "messages": [ config ] }


# write output
output_filename += '.csv' if (output_format == "csv") else '.jsonl'

with open(output_filename, 'w', encoding='utf-8') as output_file:
for convo in convo_list:
json.dump(convo, output_file, ensure_ascii=False)
output_file.write('\n')
if (output_format == "csv"):
writer = csv.writer(output_file)

writer.writerow(['system', 'user', 'agent'])
for convo in convo_list:
writer.writerow(convo)
else:
for convo in convo_list:
json.dump(convo, output_file, ensure_ascii=False)
output_file.write('\n')


if __name__ == "__main__":
Expand Down

0 comments on commit fdb45f3

Please sign in to comment.