Skip to content

Commit

Permalink
Updating orcid2 to fit Sina's specification
Browse files Browse the repository at this point in the history
  • Loading branch information
Plikt committed Apr 29, 2024
1 parent 61a5ca8 commit 782ab53
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 134 deletions.
37 changes: 0 additions & 37 deletions app/langchain_orcid.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,43 +398,6 @@ def get_orcid(authors):

return orcid_info

"""def get_orcid(authors):
orcid_info = [] # Dictionary to store author information
for author in authors:
try:
url = "https://api.openalex.org/authors?search=" + author
response = json.loads(requests.get(url).text)
except Exception as e: # Added variable 'e' to catch the exception
print(f"OpenAlex ORCID lookup returned error: {e}\n")
continue # Skip to the next author
#print(response)
if response["meta"]["count"] >= 1:
orcid = response["results"][0]["orcid"]
print(orcid)
affiliation = response["results"][0]["affiliations"][0]["institution"]["display_name"]
display_name = response["results"][0]["display_name"] # Updated to use display_name
author_info = {
"@id": f"https://orcid.org/{orcid}",
"role": "Person",
"affiliation": affiliation,
"name": display_name
}
orcid_info.append(author_info)
else:
print("None, There are no OrcID suggestions for this author")
author_info = "none"
orcid_info[author] = author_info
continue # Skip to the next author
return orcid_info
"""
def update_json_ld(json_ld, new_data):
# Process author information
loop = 0
Expand Down
122 changes: 27 additions & 95 deletions app/langchain_orcid2.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,37 +44,12 @@
load_dotenv(find_dotenv())


def num_tokens_from_string(string: str, encoding_name: str) -> int:
"""def num_tokens_from_string(string: str, encoding_name: str) -> int:
encoding = tiktoken.encoding_for_model(encoding_name)
num_tokens = len(encoding.encode(string))
return num_tokens

def get_jsonld(node):
base = "https://beta.dpid.org/"
root = "?jsonld"

#return requested node JSON
response2 = requests.get(base+node+root).json()
return response2
#https://ipfs.desci.com/ipfs/bafkreihge5qw7sc3mqc4wkf4cgpv6udtvrgipfxwyph7dhlyu6bkkt7tfq
def get_pdf_text(CIDurl):
#base = "https://beta.dpid.org/"
#root = "?raw"

#return most recent node JSON
manifest = requests.get(CIDurl).json()

#get the CID associated with the Payload + PDF object
try:
pdf_path = next(item['payload']['path'] for item in manifest['components'] if item['type'] == 'pdf')

pdf_url = next(item['payload']['url'] for item in manifest['components'] if item['type'] == 'pdf')
return num_tokens"""

except:
return "No PDF object found"

#url = base+pdf_path+"?raw"

def get_pdf_text(pdf_url):
ipfs="https://ipfs.desci.com/ipfs/"+pdf_url

response = requests.get(ipfs)
Expand Down Expand Up @@ -186,7 +161,7 @@ def paper_data_json_single(doi):
if authors_info:
authors_info = get_orcid(authors_info)
else:
authors_info.append = {'None, no authors returned by CrossRef'}
authors_info = "None"


refs = []
Expand Down Expand Up @@ -286,16 +261,13 @@ def paper_data_json_single(doi):
output_dict = {
# Paper Metadata
'title':title,
'creator':authors_info,
'creator': authors_info,
'datePublished':pub_date,
'keywords':keywords,
#'references':refs,
#'tldr':tldr,
}
print("This is the output_dict \n", output_dict)
return output_dict

async def langchain_paper_search(node):
async def langchain_paper_search(pdf_CID):
#file_path
"""
Analyzes a pdf document defined by file_path and asks questions regarding the text
Expand Down Expand Up @@ -323,17 +295,11 @@ async def langchain_paper_search(node):

#%% Extracting info from paper
# Define the PDF document, load it in
text = get_pdf_text(node)
text = get_pdf_text(pdf_CID)
document = Document(page_content = text)

# Define all the queries and corresponding schemas in a list
queries_schemas_docs = [
#("What are the experimental methods and techniques used by the authors? This can include ways that data was collected as well as ways the samples were synthesized.", document),
#("What is the scientific question, challenge, or motivation that the authors are trying to address?", document),
#("Provide a summary of the results and discussions in the paper. What results were obtained and what conclusions were reached?", document),
#("Provide a summary of each figure described in the paper. Your response should be a one sentence summary of the figure description, \
# beginning with 'Fig. # - description...'. For example:'Fig. 1 - description..., Fig. 2 - description..., Fig. 3 - description...'. Separate each figure description by a single newline.", document),
#("What future work or unanswered questions are mentioned by the authors?", document),
("Tell me who all the authors of this paper are. Your response should be a comma separated list of the authors of the paper, \
looking like 'first author name, second author name", document),
("Tell me the title of this paper", document)
Expand All @@ -349,21 +315,14 @@ async def langchain_paper_search(node):
summary = await asyncio.gather(*tasks)

# Extracting individual elements from the summary
#methods, motive, results, future figures,
authors, title = summary #NOTE: output to variables in strings
authors, title = summary

llm_output = {
#"motive": motive,
#"method": methods,
#"figures": figures,
#"results": results,
#"future": future,
"authors": authors,
"title": title
}

#transform outputs
#llm_output['figures'] = llm_output['figures'].split("\n")# using newline character as a split point.
#transform outputs into comma separated lists and then into a structured dictionary of authors.
llm_output['authors'] = llm_output['authors'].split(', ')
llm_output['authors'] = get_orcid(llm_output["authors"])

Expand Down Expand Up @@ -403,29 +362,6 @@ def get_orcid(authors):

return orcid_info

"""def get_orcid(authors):
orcid = []
author_info = {}
for author in authors:
try:
url = "https://api.openalex.org/autocomplete/authors?q=" + author
response = json.loads(requests.get(url).text)
except:
print(f"OpenAlex ORCID lookup returned error: {e}\n")
if response["meta"]["count"] == 1:
orcid = response["results"][0]["external_id"]
author_info[author] = {"orcid": orcid, "affiliation":response["results"][0]["hint"]}
elif response["meta"]["count"] == 0: #FAKE - Create a test so we can check if the return is valid.
print("None, There are no OrcID suggestions for this author")
else:
orcid = response["results"][0]["external_id"]
author_info[author] = {"orcid": orcid, "affiliation": response["results"][0]["hint"]}
#create an async function which ranks the authors based on the similarity to the paper.
return author_info"""

def update_json_ld(json_ld, new_data):
# Process author information
loop = 0
Expand Down Expand Up @@ -457,41 +393,37 @@ def update_json_ld(json_ld, new_data):
else:
json_ld["@graph"][1][key.lower()] = value
print("I'm adding: " + str(value))
#print(loop)
return json_ld


#%% Main, general case for testing
def run(node, doi=None):
def run(pdf, doi=None):
print("Starting code run...")

#node = "46" #os.getenv('NODE_ENV')
#DOI_env = "10.3847/0004-637X/828/1/46"#os.getenv('DOI_ENV') #

if node is not None:
print(f"NODE_ENVIRONMENT is set to: {node}")
if [pdf] is not None:
print(f"NODE_ENVIRONMENT is set to: {pdf}")
else:
print("NODE_ENVIRONMENT is not set.")

json_ld = get_jsonld(node)
print(json_ld)

if doi:
lookup_results = paper_data_json_single(doi)
#updated_json_ld = update_json_ld(json_ld, lookup_results)

if lookup_results['creator'] is None: # Check if author_info is None
print("No author information found. Running language chain search.")
llm_output = asyncio.run(langchain_paper_search(pdf))
output = llm_output
else:
output = lookup_results

else:
updated_json_ld = json_ld

llm_output = asyncio.run(langchain_paper_search(node))# output of unstructured text in dictionary
#updated_json_ld = update_json_ld(json_ld, llm_output)
updated_json_ld = json_ld

#doi = "https://doi.org/10.1002/adma.202208113"
llm_output = asyncio.run(langchain_paper_search(pdf))
output = llm_output

#print(updated_json_ld)

return output
print("Script completed")

"""if __name__ == "__main__":
run("46", "https://doi.org/10.1002/adma.202208113")"""
run("bafybeiamslevhsvjlnfejg7p2rzk6bncioaapwb3oauu7zqwmfpwko5ho4", "https://doi.org/10.1002/adma.202208113")
curl -X POST -H "Content-Type: application/json" -d '{"pdf": "bafybeiamslevhsvjlnfejg7p2rzk6bncioaapwb3oauu7zqwmfpwko5ho4", "doi": "https://doi.org/10.1002/adma.202208113"}' http://localhost:5001/invoke-script
curl -X POST -H "Content-Type: application/json" -d '{"pdf": "bafybeiamslevhsvjlnfejg7p2rzk6bncioaapwb3oauu7zqwmfpwko5ho4"}' http://localhost:5001/invoke-script
"""
9 changes: 7 additions & 2 deletions app/server.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
from flask import Flask, request, jsonify
import langchain_api # Import your script
import langchain_orcid2 # Import your script

app = Flask(__name__)

@app.route('/invoke-script', methods=['POST'])
def invoke_script():
data = request.json # Assuming JSON data is sent from the web app
pdf = data.get("pdf")
doi = data.get("doi")
# Call your script with the appropriate inputs
output = langchain_api.run(data['node'], data['doi'])
if doi is not None:
output = langchain_orcid2.run(pdf, doi)
else:
output = langchain_orcid2.run(pdf)
return jsonify({'output': output})

if __name__ == '__main__':
Expand Down

0 comments on commit 782ab53

Please sign in to comment.