Commit

Merge remote-tracking branch 'origin/master' into feat/openapi_refactor

zzhangpurdue committed Sep 30, 2024
2 parents 955f860 + 195459c commit 51a285e
Showing 8 changed files with 139 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .dev_scripts/dockerci.sh
@@ -27,7 +27,7 @@ playwright install --with-deps chromium

# install package
pip install fastapi pydantic uvicorn docker sqlmodel transformers ray
pip install pymongo motor llama-index-storage-docstore-mongodb llama-index-storage-index-store-mongodb llama-index-readers-mongodb
pip install pymongo motor llama-index-storage-docstore-mongodb==0.1.3 llama-index-storage-index-store-mongodb==0.1.2 llama-index-readers-mongodb==0.1.7
pip install tensorflow pyclipper shapely tf_slim
pip install moviepy

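The CI script now pins the three llama-index MongoDB integration packages to fixed versions rather than taking whatever is latest, presumably to keep the Docker CI runs reproducible. A minimal post-install sanity check one could run is sketched below; the package names and versions are the ones pinned above, while the check script itself is illustrative and not part of this commit.

```python
# Sketch: verify the pinned llama-index MongoDB packages after `pip install`.
# Package names and versions come from the dockerci.sh change above; the
# script itself is illustrative and not part of the repository.
from importlib.metadata import PackageNotFoundError, version

EXPECTED_PINS = {
    'llama-index-storage-docstore-mongodb': '0.1.3',
    'llama-index-storage-index-store-mongodb': '0.1.2',
    'llama-index-readers-mongodb': '0.1.7',
}

for name, expected in EXPECTED_PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f'{name}: NOT INSTALLED (expected {expected})')
        continue
    status = 'OK' if installed == expected else f'MISMATCH (expected {expected})'
    print(f'{name}=={installed} -> {status}')
```
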
18 changes: 12 additions & 6 deletions apps/datascience_assistant/README.md
@@ -6,21 +6,27 @@ Detailed information can be found in the [documentation](../../docs/source/agent
## Quick Start
Streamlit is a Python library that makes it easy to create and share beautiful, custom web apps for machine learning and data science.

To run the DS Assistant in streamlit, you need to install the Streamlit library. You can install it using pip:
To run the DS Assistant in Streamlit, you need to install additional libraries. You can install them using pip:
```bash
pip install streamlit streamlit-jupyter
pip install streamlit mistune matplotlib nbconvert
```
Then, you need to set

Then, you can run the DS Assistant using the following command:
```bash
streamlit run app.py
cd ../../
streamlit run ./apps/datascience_assistant/app.py
```

After running the command, a new tab will open in your default web browser with the DS Assistant running.
The following are screenshots of the DS Assistant running in the browser:

You can upload your dataset and write your request.
![img_2.png](../../resources/data_science_assistant_streamlit_1.png)
You can view all of the code in Streamlit.

After submitting your request, DS Assistant will automatically generate a plan for this request.
![img_2.png](../../resources/data_science_assistant_streamlit_4.png)

After that, DS Assistant will automatically execute every task; you can view all of the code and details in Streamlit.
![img_3.png](../../resources/data_science_assistant_streamlit_2.png)

After you have finished using the DS Assistant, you can directly convert the whole run to a PDF.
![img_5.png](../../resources/data_science_assistant_streamlit_3.png)
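
For readers new to the assistant, the plan it generates is a JSON list of tasks whose fields (`task_id`, `dependent_task_ids`, `instruction`, `task_type`) follow the planning templates added later in this commit. A purely illustrative example of such a plan, written here as a Python literal with invented values:

```python
# Hypothetical plan for a tabular-prediction request. Field names follow the
# plan/decompose templates in this commit; the concrete tasks are invented.
example_plan = [
    {
        'task_id': '1',
        'dependent_task_ids': [],
        'instruction': 'Perform EDA on the uploaded dataset',
        'task_type': 'eda',
    },
    {
        'task_id': '2',
        'dependent_task_ids': ['1'],
        'instruction': 'Handle missing values and encode categorical columns',
        'task_type': 'data preprocessing',
    },
    {
        'task_id': '3',
        'dependent_task_ids': ['2'],
        'instruction': 'Train a baseline model',
        'task_type': 'model train',
    },
    {
        'task_id': '4',
        'dependent_task_ids': ['3'],
        'instruction': 'Evaluate the model and report metrics',
        'task_type': 'model evaluate',
    },
]
```
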
60 changes: 41 additions & 19 deletions apps/datascience_assistant/app.py
@@ -1,23 +1,45 @@
import os
import sys

import streamlit as st
from modelscope_agent.agents.data_science_assistant import DataScienceAssistant
from modelscope_agent.tools.metagpt_tools.tool_recommend import \
    TypeMatchToolRecommender

llm_config = {
    'model': 'qwen2-72b-instruct',
    'model_server': 'dashscope',
}
os.environ['DASHSCOPE_API_KEY'] = input(
    'Please input your dashscope api key: ')
data_science_assistant = DataScienceAssistant(
    llm=llm_config, tool_recommender=TypeMatchToolRecommender(tools=['<all>']))
st.title('Data Science Assistant')
st.write(
    'This is a data science assistant that can help you with your data science tasks.'
)
st.write('Please input your request below and click the submit button.')
user_request = st.text_input('User Request')
if st.button('submit'):
    data_science_assistant.run(user_request=user_request, streamlit=True)
os.environ['DASHSCOPE_API_KEY'] = 'YOUR_API_KEY'


def setup_project_paths():
    current_dir = os.path.dirname(os.path.abspath(__file__))  # noqa
    project_root_path = os.path.abspath(os.path.join(current_dir,
                                                     '../../'))  # noqa
    sys.path.append(project_root_path)  # noqa


if __name__ == '__main__':
    setup_project_paths()
    from modelscope_agent.agents.data_science_assistant import \
        DataScienceAssistant  # noqa
    from modelscope_agent.tools.metagpt_tools.tool_recommend import \
        TypeMatchToolRecommender  # noqa
    st.title('Data Science Assistant')
    st.write(
        'This is a data science assistant that can help you with your data science tasks.'
    )
    st.write(
        'Please input your request and upload files then click the submit button.'
    )

    files = st.file_uploader(
        'Please upload files that you need. ', accept_multiple_files=True)
    last_file_name = ''
    user_request = st.text_area('User Request')
    if st.button('submit'):
        llm_config = {
            'model': 'qwen2-72b-instruct',
            'model_server': 'dashscope',
        }
        data_science_assistant = DataScienceAssistant(
            llm=llm_config,
            tool_recommender=TypeMatchToolRecommender(tools=['<all>']))
        for file in files:
            with open(file.name, 'wb') as f:
                f.write(file.getbuffer())
        data_science_assistant.run(user_request=user_request, streamlit=True)
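
The rewritten entry point defers the `modelscope_agent` imports until `setup_project_paths()` has put the repository root on `sys.path`, which is why the README now asks you to launch Streamlit from the repository root. A rough non-Streamlit sketch of the same flow follows; it reuses the constructor and `run()` call shown above, while the repository path, the sample request, and the assumption that `streamlit=False` falls back to console output are all illustrative.

```python
# Minimal sketch mirroring app.py above, driven from a plain script.
# Assumes a valid DashScope key and that the repository root is importable;
# the path and request string are placeholders.
import os
import sys

os.environ['DASHSCOPE_API_KEY'] = 'YOUR_API_KEY'
sys.path.append('/path/to/modelscope-agent')  # what setup_project_paths() does

from modelscope_agent.agents.data_science_assistant import DataScienceAssistant
from modelscope_agent.tools.metagpt_tools.tool_recommend import \
    TypeMatchToolRecommender

llm_config = {
    'model': 'qwen2-72b-instruct',
    'model_server': 'dashscope',
}
assistant = DataScienceAssistant(
    llm=llm_config,
    tool_recommender=TypeMatchToolRecommender(tools=['<all>']))
# streamlit=False is an assumption: console output instead of the web UI.
assistant.run(
    user_request='Analyse train.csv and train a model to predict the target column',
    streamlit=False)
```
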
2 changes: 1 addition & 1 deletion examples/apps/modelscope_agentfabric.ipynb
@@ -120,7 +120,7 @@
}
],
"source": [
"! sed -i 's#demo.launch()#demo.launch(share=True)#g' app.py && export PYTHONPATH=$PYTHONPATH:/content/modelscope-agent && python app.py"
"! export PYTHONPATH=$PYTHONPATH:/content/modelscope-agent && python app.py"
]
}
],
92 changes: 82 additions & 10 deletions modelscope_agent/agents/data_science_assistant.py
@@ -1,6 +1,4 @@
# Implementation inspired by the paper "DATA INTERPRETER: AN LLM AGENT FOR DATA SCIENCE"
import asyncio
import copy
import os
import time
from datetime import datetime
@@ -39,8 +37,7 @@
- **other**: Any tasks not in the defined categories
# Task:
Based on the context, write a simple plan or modify an existing plan of what you should do to achieve the goal. A plan \
consists of one to four tasks.
Based on the context, write a simple plan or modify an existing plan of what you should do to achieve the goal.
Output a list of jsons following the format:
```json
@@ -55,6 +52,44 @@
]
```
"""

DECOMPOSE_TASK_TEMPLATE = """
# Context:
{context}
# Available Task Types:
- **eda**: For performing exploratory data analysis
- **data preprocessing**: For preprocessing dataset in a data analysis or machine learning task ONLY,\
general data operation doesn't fall into this type
- **feature engineering**: Only for creating new columns for input data.
- **model train**: Only for training model.
- **model evaluate**: Only for evaluating model.
- **ocr**: Only for OCR tasks.
- **other**: Any tasks not in the defined categories
# Previous Tasks
We have already generated the following tasks:
{previous_tasks}
# Task:
The current task is:
{current_task}
The current task is too complex to be executed in one step. Please decompose it into smaller tasks \
and output a list of jsons following the format:
```json
[
    {{
        "task_id": str = "unique identifier for a task in plan, can be an ordinal, \
should be unique and not conflict with previous task ids",
        "dependent_task_ids": list[str] = "ids of tasks prerequisite to this task",
        "instruction": "what you should do in this task, one short phrase or sentence",
        "task_type": "type of this task, should be one of Available Task Types",
    }},
    ...
]
```
"""

CODE_TEMPLATE = """
# Task
you are a code generator, you need to generate a code python block in jupyter notebook to achieve the \
@@ -597,8 +632,8 @@ def _judge_code(self, task, previous_code_blocks, code,
if 'incorrect' in judge_result.split('\n')[-1]:
success = False
failed_reason = (
'Though the code executes successfully, The code logic is incorrect, here is the reason: '
+ judge_result)
'Though the code executes successfully, The code logic is \
incorrect, here is the reason: ' + judge_result)
return success, failed_reason

else:
@@ -634,7 +669,7 @@ def _run(self, user_request, save: bool = True, **kwargs):
previous_code_blocks = self._get_previous_code_blocks()
success = False
code_counter = 0
max_try = kwargs.get('max_try', 10)
max_try = kwargs.get('max_try', 1)
while not success and code_counter < max_try:
code_execute_success = False
code_logic_success = False
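
Note that this hunk lowers the default per-task retry budget from 10 to 1. Since `max_try` is read from `**kwargs`, the old behaviour can presumably still be requested by the caller; the sketch below assumes that `run()` forwards keyword arguments to `_run()` unchanged and that `assistant` is a `DataScienceAssistant` constructed as in app.py above.

```python
# Sketch: restoring the pre-commit retry budget, assuming run() forwards
# kwargs to _run(). `assistant` is a DataScienceAssistant instance as
# constructed in apps/datascience_assistant/app.py.
assistant.run(
    user_request='Train and evaluate a model on the uploaded dataset',
    streamlit=True,
    max_try=10)  # retry each task up to 10 times instead of the new default of 1
```
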
@@ -726,9 +761,13 @@ def _run(self, user_request, save: bool = True, **kwargs):
encoding='utf-8') as file:
nbformat.write(self.code_interpreter.nb, file)
else:
self.plan = self._update_plan(
user_request=user_request, curr_plan=self.plan)
self.code_interpreter.reset()
decomposed_tasks = self._decompose_task(task)
if decomposed_tasks:
self.plan.replace_task(task, decomposed_tasks)
else:
self.plan = self._update_plan(
user_request=user_request, curr_plan=self.plan)
self.code_interpreter.reset()
# save the plan into json file
if save:
after_time = time.time()
@@ -769,3 +808,36 @@ def _get_total_tokens(self):
except Exception as e:
logger.error(f'get total token error: {e}')
pass

def _decompose_task(self, task):
    try:
        print(f'decompose task {task.task_id}')
        messages = [{
            'role':
            'user',
            'content':
            DECOMPOSE_TASK_TEMPLATE.format(
                context='User Request: ' + task.instruction + '\n',
                previous_tasks='\n'.join([
                    json.dumps({
                        'task_id': t.task_id,
                        'dependent_task_ids': t.dependent_task_ids,
                        'instruction': t.instruction,
                        'task_type': t.task_type
                    }) for t in self.plan.tasks
                ]),
                current_task=json.dumps(task.__dict__))
        }]
        resp = self._call_llm(prompt=None, messages=messages, stop=None)
        tasks_text = ''
        for r in resp:
            tasks_text += r
        tasks_text = parse_code(text=tasks_text, lang='json')
        logger.info(f'decomposed tasks: {tasks_text}')

        tasks = json5.loads(tasks_text)
        tasks = [Task(**task) for task in tasks]
        return tasks
    except Exception as e:
        logger.error(f'decompose task error: {e}')
        return None
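
The new failure path calls `self.plan.replace_task(task, decomposed_tasks)`, whose implementation is not part of this diff. Purely to illustrate what the decomposition fallback relies on, a hypothetical sketch of such a method is given below; the real `Plan` and `Task` classes live elsewhere in `modelscope_agent` and may well differ.

```python
# Hypothetical Plan.replace_task, NOT taken from the repository: it only
# illustrates how a too-complex task could be spliced out in favour of its
# decomposed subtasks. Field names follow the task JSON used above.
def replace_task(self, old_task, new_tasks):
    # Find the failed task in the current plan.
    idx = next(i for i, t in enumerate(self.tasks)
               if t.task_id == old_task.task_id)
    # Splice the decomposed subtasks in where the old task used to be.
    self.tasks[idx:idx + 1] = list(new_tasks)
    # Re-point anything that depended on the old task at the last subtask.
    for t in self.tasks:
        t.dependent_task_ids = [
            new_tasks[-1].task_id if dep == old_task.task_id else dep
            for dep in t.dependent_task_ids
        ]
```
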
3 changes: 2 additions & 1 deletion requirements.txt
@@ -7,7 +7,8 @@ jupyter>=1.0.0
langchain
langchain-community
langchain-experimental
llama-index
llama-index==0.10.29
llama-index-core==0.10.39.post1
llama-index-readers-json
llama-index-retrievers-bm25==0.1.5
modelscope[framework]>=1.16.0
Binary file modified resources/data_science_assistant_streamlit_1.png
Binary file added resources/data_science_assistant_streamlit_4.png
