diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index 9fd014c82..bfc21e54d 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -27,7 +27,7 @@ playwright install --with-deps chromium # install package pip install fastapi pydantic uvicorn docker sqlmodel transformers ray -pip install pymongo motor llama-index-storage-docstore-mongodb llama-index-storage-index-store-mongodb llama-index-readers-mongodb +pip install pymongo motor llama-index-storage-docstore-mongodb==0.1.3 llama-index-storage-index-store-mongodb==0.1.2 llama-index-readers-mongodb==0.1.7 pip install tensorflow pyclipper shapely tf_slim pip install moviepy diff --git a/apps/datascience_assistant/README.md b/apps/datascience_assistant/README.md index 961ce0d83..dc1f739d3 100644 --- a/apps/datascience_assistant/README.md +++ b/apps/datascience_assistant/README.md @@ -6,21 +6,27 @@ Detailed information can be found in the [documentation](../../docs/source/agent ## Quick Start Streamlit is a Python library that makes it easy to create and share beautiful, custom web apps for machine learning and data science. -To run the DS Assistant in streamlit, you need to install the Streamlit library. You can install it using pip: +To run the DS Assistant in streamlit, you need to install additional libraries. You can install them using pip: ```bash -pip install streamlit streamlit-jupyter +pip install streamlit mistune matplotlib nbconvert ``` -Then, you need to set + Then, you can run the DS Assistant using the following command: ```bash -streamlit run app.py +cd ../../ +streamlit run ./apps/datascience_assistant/app.py ``` After running the command, a new tab will open in your default web browser with the DS Assistant running. -The following are screenshots of the DS Assistant running in the browser: +You can upload your dataset and write your request. 
![img_2.png](../../resources/data_science_assistant_streamlit_1.png) -you can view all of the codes and in streamlit + + +After submitting your request, DS Assistant will automatically generate a plan for this request. +![img_2.png](../../resources/data_science_assistant_streamlit_4.png) + +After that, DS Assistant will automatically execute every task, you can view all of the codes and details in streamlit ![img_3.png](../../resources/data_science_assistant_streamlit_2.png) + After you have finished using the DS Assistant, you can directly convert the running process to a pdf ![img_5.png](../../resources/data_science_assistant_streamlit_3.png) diff --git a/apps/datascience_assistant/app.py b/apps/datascience_assistant/app.py index e590a1a66..db3dcbf89 100644 --- a/apps/datascience_assistant/app.py +++ b/apps/datascience_assistant/app.py @@ -1,23 +1,45 @@ import os +import sys import streamlit as st -from modelscope_agent.agents.data_science_assistant import DataScienceAssistant -from modelscope_agent.tools.metagpt_tools.tool_recommend import \ - TypeMatchToolRecommender -llm_config = { - 'model': 'qwen2-72b-instruct', - 'model_server': 'dashscope', -} -os.environ['DASHSCOPE_API_KEY'] = input( - 'Please input your dashscope api key: ') -data_science_assistant = DataScienceAssistant( - llm=llm_config, tool_recommender=TypeMatchToolRecommender(tools=[''])) -st.title('Data Science Assistant') -st.write( - 'This is a data science assistant that can help you with your data science tasks.' 
-) -st.write('Please input your request below and click the submit button.') -user_request = st.text_input('User Request') -if st.button('submit'): - data_science_assistant.run(user_request=user_request, streamlit=True) +os.environ['DASHSCOPE_API_KEY'] = 'YOUR_API_KEY' + + +def setup_project_paths(): + current_dir = os.path.dirname(os.path.abspath(__file__)) # noqa + project_root_path = os.path.abspath(os.path.join(current_dir, + '../../')) # noqa + sys.path.append(project_root_path) # noqa + + +if __name__ == '__main__': + setup_project_paths() + from modelscope_agent.agents.data_science_assistant import \ + DataScienceAssistant # noqa + from modelscope_agent.tools.metagpt_tools.tool_recommend import \ + TypeMatchToolRecommender # noqa + st.title('Data Science Assistant') + st.write( + 'This is a data science assistant that can help you with your data science tasks.' + ) + st.write( + 'Please input your request and upload files then click the submit button.' + ) + + files = st.file_uploader( + 'Please upload files that you need. ', accept_multiple_files=True) + last_file_name = '' + user_request = st.text_area('User Request') + if st.button('submit'): + llm_config = { + 'model': 'qwen2-72b-instruct', + 'model_server': 'dashscope', + } + data_science_assistant = DataScienceAssistant( + llm=llm_config, + tool_recommender=TypeMatchToolRecommender(tools=[''])) + for file in files: + with open(file.name, 'wb') as f: + f.write(file.getbuffer()) + data_science_assistant.run(user_request=user_request, streamlit=True) diff --git a/examples/apps/modelscope_agentfabric.ipynb b/examples/apps/modelscope_agentfabric.ipynb index 7fb7eb751..d47c42b5c 100644 --- a/examples/apps/modelscope_agentfabric.ipynb +++ b/examples/apps/modelscope_agentfabric.ipynb @@ -120,7 +120,7 @@ } ], "source": [ - "! sed -i 's#demo.launch()#demo.launch(share=True)#g' app.py && export PYTHONPATH=$PYTHONPATH:/content/modelscope-agent && python app.py" + "! 
export PYTHONPATH=$PYTHONPATH:/content/modelscope-agent && python app.py" ] } ], diff --git a/modelscope_agent/agents/data_science_assistant.py b/modelscope_agent/agents/data_science_assistant.py index db8ad2472..718f62105 100644 --- a/modelscope_agent/agents/data_science_assistant.py +++ b/modelscope_agent/agents/data_science_assistant.py @@ -1,6 +1,4 @@ # Implementation inspired by the paper "DATA INTERPRETER: AN LLM AGENT FOR DATA SCIENCE" -import asyncio -import copy import os import time from datetime import datetime @@ -39,8 +37,7 @@ - **other**: Any tasks not in the defined categories # Task: -Based on the context, write a simple plan or modify an existing plan of what you should do to achieve the goal. A plan \ -consists of one to four tasks. +Based on the context, write a simple plan or modify an existing plan of what you should do to achieve the goal. Output a list of jsons following the format: ```json @@ -55,6 +52,44 @@ ] ``` """ + +DECOMPOSE_TASK_TEMPLATE = """ +# Context: +{context} +# Available Task Types: +- **eda**: For performing exploratory data analysis +- **data preprocessing**: For preprocessing dataset in a data analysis or machine learning task ONLY,\ +general data operation doesn't fall into this type +- **feature engineering**: Only for creating new columns fo input data. +- **model train**: Only for training model. +- **model evaluate**: Only for evaluating model. +- **ocr**: Only for OCR tasks. +- **other**: Any tasks not in the defined categories + +# Previous Tasks +We have already generated the following tasks: +{previous_tasks} +# Task: +The current task is: +{current_task} +Currently, the current task is too complex to be executed in one step. 
Please decompose the task into smaller tasks, \ +and output a list of jsons following the format: +Output a list of jsons following the format: + +```json +[ + {{ + "task_id": str = "unique identifier for a task in plan, can be an ordinal, \ + should be unique and not conflict with previous task ids", + "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task", + "instruction": "what you should do in this task, one short phrase or sentence", + "task_type": "type of this task, should be one of Available Task Types", + }}, + ... +] +``` +""" + CODE_TEMPLATE = """ # Task you are a code generator, you need to generate a code python block in jupyter notebook to achieve the \ @@ -597,8 +632,8 @@ def _judge_code(self, task, previous_code_blocks, code, if 'incorrect' in judge_result.split('\n')[-1]: success = False failed_reason = ( - 'Though the code executes successfully, The code logic is incorrect, here is the reason: ' - + judge_result) + 'Though the code executes successfully, The code logic is \ + incorrect, here is the reason: ' + judge_result) return success, failed_reason else: @@ -634,7 +669,7 @@ def _run(self, user_request, save: bool = True, **kwargs): previous_code_blocks = self._get_previous_code_blocks() success = False code_counter = 0 - max_try = kwargs.get('max_try', 10) + max_try = kwargs.get('max_try', 1) while not success and code_counter < max_try: code_execute_success = False code_logic_success = False @@ -726,9 +761,13 @@ def _run(self, user_request, save: bool = True, **kwargs): encoding='utf-8') as file: nbformat.write(self.code_interpreter.nb, file) else: - self.plan = self._update_plan( - user_request=user_request, curr_plan=self.plan) - self.code_interpreter.reset() + decomposed_tasks = self._decompose_task(task) + if decomposed_tasks: + self.plan.replace_task(task, decomposed_tasks) + else: + self.plan = self._update_plan( + user_request=user_request, curr_plan=self.plan) + self.code_interpreter.reset() # save the plan into 
json file if save: after_time = time.time() @@ -769,3 +808,36 @@ def _get_total_tokens(self): except Exception as e: logger.error(f'get total token error: {e}') pass + + def _decompose_task(self, task): + try: + print(f'decompose task {task.task_id}') + messages = [{ + 'role': + 'user', + 'content': + DECOMPOSE_TASK_TEMPLATE.format( + context='User Request: ' + task.instruction + '\n', + previous_tasks='\n'.join([ + json.dumps({ + 'task_id': t.task_id, + 'dependent_task_ids': t.dependent_task_ids, + 'instruction': t.instruction, + 'task_type': t.task_type + }) for t in self.plan.tasks + ]), + current_task=json.dumps(task.__dict__)) + }] + resp = self._call_llm(prompt=None, messages=messages, stop=None) + tasks_text = '' + for r in resp: + tasks_text += r + tasks_text = parse_code(text=tasks_text, lang='json') + logger.info(f'decomposed tasks: {tasks_text}') + + tasks = json5.loads(tasks_text) + tasks = [Task(**task) for task in tasks] + return tasks + except Exception as e: + logger.error(f'decompose task error: {e}') + return None diff --git a/requirements.txt b/requirements.txt index 87fdd527d..789df15de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,8 @@ jupyter>=1.0.0 langchain langchain-community langchain-experimental -llama-index +llama-index==0.10.29 +llama-index-core==0.10.39.post1 llama-index-readers-json llama-index-retrievers-bm25==0.1.5 modelscope[framework]>=1.16.0 diff --git a/resources/data_science_assistant_streamlit_1.png b/resources/data_science_assistant_streamlit_1.png index e98b56c4a..7dc423e6d 100644 Binary files a/resources/data_science_assistant_streamlit_1.png and b/resources/data_science_assistant_streamlit_1.png differ diff --git a/resources/data_science_assistant_streamlit_4.png b/resources/data_science_assistant_streamlit_4.png new file mode 100644 index 000000000..f24499ec0 Binary files /dev/null and b/resources/data_science_assistant_streamlit_4.png differ