Skip to content

Commit

Permalink
Updating orcid2 to fit Sina's specification
Browse files Browse the repository at this point in the history
  • Loading branch information
Plikt committed Apr 29, 2024
1 parent 61a5ca8 commit 782ab53
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 134 deletions.
37 changes: 0 additions & 37 deletions app/langchain_orcid.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,43 +398,6 @@ def get_orcid(authors):

return orcid_info

"""def get_orcid(authors):
orcid_info = [] # Dictionary to store author information
for author in authors:
try:
url = "https://api.openalex.org/authors?search=" + author
response = json.loads(requests.get(url).text)
except Exception as e: # Added variable 'e' to catch the exception
print(f"OpenAlex ORCID lookup returned error: {e}\n")
continue # Skip to the next author
#print(response)
if response["meta"]["count"] >= 1:
orcid = response["results"][0]["orcid"]
print(orcid)
affiliation = response["results"][0]["affiliations"][0]["institution"]["display_name"]
display_name = response["results"][0]["display_name"] # Updated to use display_name
author_info = {
"@id": f"https://orcid.org/{orcid}",
"role": "Person",
"affiliation": affiliation,
"name": display_name
}
orcid_info.append(author_info)
else:
print("None, There are no OrcID suggestions for this author")
author_info = "none"
orcid_info[author] = author_info
continue # Skip to the next author
return orcid_info
"""
def update_json_ld(json_ld, new_data):
# Process author information
loop = 0
Expand Down
122 changes: 27 additions & 95 deletions app/langchain_orcid2.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,37 +44,12 @@
load_dotenv(find_dotenv())


def num_tokens_from_string(string: str, encoding_name: str) -> int:
"""def num_tokens_from_string(string: str, encoding_name: str) -> int:
encoding = tiktoken.encoding_for_model(encoding_name)
num_tokens = len(encoding.encode(string))
return num_tokens

def get_jsonld(node):
base = "https://beta.dpid.org/"
root = "?jsonld"

#return requested node JSON
response2 = requests.get(base+node+root).json()
return response2
#https://ipfs.desci.com/ipfs/bafkreihge5qw7sc3mqc4wkf4cgpv6udtvrgipfxwyph7dhlyu6bkkt7tfq
def get_pdf_text(CIDurl):
#base = "https://beta.dpid.org/"
#root = "?raw"

#return most recent node JSON
manifest = requests.get(CIDurl).json()

#get the CID associated with the Payload + PDF object
try:
pdf_path = next(item['payload']['path'] for item in manifest['components'] if item['type'] == 'pdf')

pdf_url = next(item['payload']['url'] for item in manifest['components'] if item['type'] == 'pdf')
return num_tokens"""

except:
return "No PDF object found"

#url = base+pdf_path+"?raw"

def get_pdf_text(pdf_url):
ipfs="https://ipfs.desci.com/ipfs/"+pdf_url

response = requests.get(ipfs)
Expand Down Expand Up @@ -186,7 +161,7 @@ def paper_data_json_single(doi):
if authors_info:
authors_info = get_orcid(authors_info)
else:
authors_info.append = {'None, no authors returned by CrossRef'}
authors_info = "None"


refs = []
Expand Down Expand Up @@ -286,16 +261,13 @@ def paper_data_json_single(doi):
output_dict = {
# Paper Metadata
'title':title,
'creator':authors_info,
'creator': authors_info,
'datePublished':pub_date,
'keywords':keywords,
#'references':refs,
#'tldr':tldr,
}
print("This is the output_dict \n", output_dict)
return output_dict

async def langchain_paper_search(node):
async def langchain_paper_search(pdf_CID):
#file_path
"""
Analyzes a pdf document defined by file_path and asks questions regarding the text
Expand Down Expand Up @@ -323,17 +295,11 @@ async def langchain_paper_search(node):

#%% Extracting info from paper
# Define the PDF document, load it in
text = get_pdf_text(node)
text = get_pdf_text(pdf_CID)
document = Document(page_content = text)

# Define all the queries and corresponding schemas in a list
queries_schemas_docs = [
#("What are the experimental methods and techniques used by the authors? This can include ways that data was collected as well as ways the samples were synthesized.", document),
#("What is the scientific question, challenge, or motivation that the authors are trying to address?", document),
#("Provide a summary of the results and discussions in the paper. What results were obtained and what conclusions were reached?", document),
#("Provide a summary of each figure described in the paper. Your response should be a one sentence summary of the figure description, \
# beginning with 'Fig. # - description...'. For example:'Fig. 1 - description..., Fig. 2 - description..., Fig. 3 - description...'. Separate each figure description by a single newline.", document),
#("What future work or unanswered questions are mentioned by the authors?", document),
("Tell me who all the authors of this paper are. Your response should be a comma separated list of the authors of the paper, \
looking like 'first author name, second author name", document),
("Tell me the title of this paper", document)
Expand All @@ -349,21 +315,14 @@ async def langchain_paper_search(node):
summary = await asyncio.gather(*tasks)

# Extracting individual elements from the summary
#methods, motive, results, future figures,
authors, title = summary #NOTE: output to variables in strings
authors, title = summary

llm_output = {
#"motive": motive,
#"method": methods,
#"figures": figures,
#"results": results,
#"future": future,
"authors": authors,
"title": title
}

#transform outputs
#llm_output['figures'] = llm_output['figures'].split("\n")# using newline character as a split point.
#transform outputs into comma separated lists and then into a structured dictionary of authors.
llm_output['authors'] = llm_output['authors'].split(', ')
llm_output['authors'] = get_orcid(llm_output["authors"])

Expand Down Expand Up @@ -403,29 +362,6 @@ def get_orcid(authors):

return orcid_info

"""def get_orcid(authors):
orcid = []
author_info = {}
for author in authors:
try:
url = "https://api.openalex.org/autocomplete/authors?q=" + author
response = json.loads(requests.get(url).text)
except:
print(f"OpenAlex ORCID lookup returned error: {e}\n")
if response["meta"]["count"] == 1:
orcid = response["results"][0]["external_id"]
author_info[author] = {"orcid": orcid, "affiliation":response["results"][0]["hint"]}
elif response["meta"]["count"] == 0: #FAKE - Create a test so we can check if the return is valid.
print("None, There are no OrcID suggestions for this author")
else:
orcid = response["results"][0]["external_id"]
author_info[author] = {"orcid": orcid, "affiliation": response["results"][0]["hint"]}
#create an async function which ranks the authors based on the similarity to the paper.
return author_info"""

def update_json_ld(json_ld, new_data):
# Process author information
loop = 0
Expand Down Expand Up @@ -457,41 +393,37 @@ def update_json_ld(json_ld, new_data):
else:
json_ld["@graph"][1][key.lower()] = value
print("I'm adding: " + str(value))
#print(loop)
return json_ld


#%% Main, general case for testing
def run(node, doi=None):
def run(pdf, doi=None):
print("Starting code run...")

#node = "46" #os.getenv('NODE_ENV')
#DOI_env = "10.3847/0004-637X/828/1/46"#os.getenv('DOI_ENV') #

if node is not None:
print(f"NODE_ENVIRONMENT is set to: {node}")
if [pdf] is not None:
print(f"NODE_ENVIRONMENT is set to: {pdf}")
else:
print("NODE_ENVIRONMENT is not set.")

json_ld = get_jsonld(node)
print(json_ld)

if doi:
lookup_results = paper_data_json_single(doi)
#updated_json_ld = update_json_ld(json_ld, lookup_results)

if lookup_results['creator'] is None: # Check if author_info is None
print("No author information found. Running language chain search.")
llm_output = asyncio.run(langchain_paper_search(pdf))
output = llm_output
else:
output = lookup_results

else:
updated_json_ld = json_ld

llm_output = asyncio.run(langchain_paper_search(node))# output of unstructured text in dictionary
#updated_json_ld = update_json_ld(json_ld, llm_output)
updated_json_ld = json_ld

#doi = "https://doi.org/10.1002/adma.202208113"
llm_output = asyncio.run(langchain_paper_search(pdf))
output = llm_output

#print(updated_json_ld)

return output
print("Script completed")

"""if __name__ == "__main__":
run("46", "https://doi.org/10.1002/adma.202208113")"""
run("bafybeiamslevhsvjlnfejg7p2rzk6bncioaapwb3oauu7zqwmfpwko5ho4", "https://doi.org/10.1002/adma.202208113")
curl -X POST -H "Content-Type: application/json" -d '{"pdf": "bafybeiamslevhsvjlnfejg7p2rzk6bncioaapwb3oauu7zqwmfpwko5ho4", "doi": "https://doi.org/10.1002/adma.202208113"}' http://localhost:5001/invoke-script
curl -X POST -H "Content-Type: application/json" -d '{"pdf": "bafybeiamslevhsvjlnfejg7p2rzk6bncioaapwb3oauu7zqwmfpwko5ho4"}' http://localhost:5001/invoke-script
"""
9 changes: 7 additions & 2 deletions app/server.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
from flask import Flask, request, jsonify
import langchain_api # Import your script
import langchain_orcid2 # Import your script

app = Flask(__name__)

@app.route('/invoke-script', methods=['POST'])
def invoke_script():
data = request.json # Assuming JSON data is sent from the web app
pdf = data.get("pdf")
doi = data.get("doi")
# Call your script with the appropriate inputs
output = langchain_api.run(data['node'], data['doi'])
if doi is not None:
output = langchain_orcid2.run(pdf, doi)
else:
output = langchain_orcid2.run(pdf)
return jsonify({'output': output})

if __name__ == '__main__':
Expand Down

0 comments on commit 782ab53

Please sign in to comment.