Performance Metrics #38

Merged: 56 commits, Jan 10, 2025

Commits
5a2e0be
adds timeit functionality to chat and query functions
jeisenman23 Nov 14, 2024
86e2a51
reformatting time metrics
jeisenman23 Nov 14, 2024
93a7d97
trying to fix lint
jeisenman23 Nov 15, 2024
4e4dc4b
fixing lint
jeisenman23 Nov 15, 2024
fab34d9
reducing complexity
jeisenman23 Nov 15, 2024
785ec4d
removing white spaces
jeisenman23 Nov 15, 2024
b0429a4
fixing lint
jeisenman23 Nov 15, 2024
a41b030
fixing return statement
jeisenman23 Nov 15, 2024
dba79f8
fixing docs
jeisenman23 Nov 15, 2024
dc4579a
removing timeit as optional argument
jeisenman23 Nov 21, 2024
bfdd13f
changing performance metrics according to stream mode:
jeisenman23 Nov 21, 2024
96be266
update test
jeisenman23 Nov 21, 2024
6d37186
change test to fit performance metrics
jeisenman23 Dec 10, 2024
4aa6fb2
change test to fit performance metrics
jeisenman23 Dec 10, 2024
0ac1ca9
adding back query into chat
jeisenman23 Dec 10, 2024
f8c3d1a
removing await
jeisenman23 Dec 10, 2024
4018c59
fixing test
jeisenman23 Dec 20, 2024
4ca2970
fixing test
jeisenman23 Dec 20, 2024
34c58c0
removing whitespace
jeisenman23 Dec 20, 2024
abd7000
fixing space
jeisenman23 Dec 20, 2024
d94403c
finicky flake8 error fix
jeisenman23 Dec 20, 2024
63070ef
fixing elm tests
jeisenman23 Dec 20, 2024
20fb602
ensuring test cases
jeisenman23 Dec 20, 2024
15c417a
reversing - statement
jeisenman23 Dec 20, 2024
0186fdf
removing whitespace
jeisenman23 Dec 20, 2024
d4a9bf0
removing whitespace
jeisenman23 Dec 20, 2024
fc88d7a
fixing line issue
jeisenman23 Dec 27, 2024
878e4f0
fixing osti bug
jeisenman23 Jan 2, 2025
b496352
adding spaces for engineer query
jeisenman23 Jan 2, 2025
991d3e3
adding spaces for chat function
jeisenman23 Jan 2, 2025
af86612
remove trailing whitespaces
jeisenman23 Jan 2, 2025
ae37420
Merge branch 'main' into time
jeisenman23 Jan 2, 2025
3232c02
fixing OSTI bug
jeisenman23 Jan 6, 2025
5334535
removing comments for flake
jeisenman23 Jan 6, 2025
8f4cdcc
making line shorter
jeisenman23 Jan 6, 2025
8d15faa
line too long
jeisenman23 Jan 6, 2025
8348af3
adding blank line
jeisenman23 Jan 6, 2025
667fe05
rerun of actions
jeisenman23 Jan 6, 2025
80be899
changing first
jeisenman23 Jan 6, 2025
564f5d4
attempting to fix osti
jeisenman23 Jan 6, 2025
bc196f1
attempt to fix OSTI in multiple envs
jeisenman23 Jan 6, 2025
022f734
removing test and fixing test
jeisenman23 Jan 6, 2025
371dcc2
inputting local change that works
jeisenman23 Jan 6, 2025
6549134
fixing lint
jeisenman23 Jan 6, 2025
ab7449c
debug statement
jeisenman23 Jan 6, 2025
96c3b87
attempt to fix escape sequence
jeisenman23 Jan 6, 2025
24b41c9
attempting to fix str
jeisenman23 Jan 6, 2025
7f55760
fixing escape
jeisenman23 Jan 6, 2025
4eac5dc
getting get pages to work
jeisenman23 Jan 7, 2025
2ed61b9
clean code
jeisenman23 Jan 7, 2025
7819a13
fixing linter
jeisenman23 Jan 7, 2025
80c92b6
fixing linter
jeisenman23 Jan 7, 2025
0e6b9f9
fixing linter
jeisenman23 Jan 7, 2025
18a4465
fixing over indent
jeisenman23 Jan 7, 2025
ac3e3c5
cleaned up docstrings and removed unnecessary debug kwarg from wizard…
grantbuster Jan 9, 2025
28680a0
Merge pull request #44 from NREL/gb/nodebugkw
jeisenman23 Jan 10, 2025
3 changes: 2 additions & 1 deletion .gitignore
@@ -122,6 +122,7 @@ examples/research_hub/pdfs/
examples/research_hub/embed/
examples/research_hub/txt/
examples/research_hub/meta.csv

*ignore*.py

# pixi environments
@@ -130,4 +131,4 @@ examples/research_hub/meta.csv
pixi*

# Scratch
*scratch*/
*scratch*/
139 changes: 107 additions & 32 deletions elm/web/osti.py
@@ -2,9 +2,11 @@
"""
Utilities for retrieving data from OSTI.
"""
import re
import copy
import requests
import json
from typing import Dict, List
import os
import pandas as pd
import logging
@@ -28,7 +30,9 @@ def __init__(self, record):

@staticmethod
def strip_nested_brackets(text):
"""Remove text between brackets/parentheses for cleaning OSTI text"""
"""
Remove text between brackets/parentheses for cleaning OSTI text
"""
ret = ''
skip1c = 0
skip2c = 0
@@ -145,7 +149,6 @@ def download(self, fp):
fp : str
Filepath to download this record to, typically a .pdf
"""
# OSTI returns citation on first query and pdf on second (weird)
session = requests.Session()
response = session.get(self.url)
response = session.get(self.url)
@@ -168,8 +171,8 @@ def __init__(self, url, n_pages=1):
https://www.osti.gov/api/v1/docs
n_pages : int
Number of pages to get from the API. Typical response has 20
entries per page. Default of 1 ensures that this class doesnt hang
on a million responses.
entries per page. Default of 1 ensures that this class doesnt
hang on a million responses.
"""

self.url = url
@@ -182,30 +185,87 @@ def __init__(self, url, n_pages=1):
records = [OstiRecord(single) for single in records]
super().__init__(records)

def _get_first(self):
"""Get the first page of OSTI records
def clean_escape_sequences(self, text: str) -> str:
"""Clean problematic escape sequences and formatting in JSON text.

Parameters
----------
text : str
Raw JSON text to be cleaned

Returns
-------
list
str
Cleaned JSON text with proper escape sequences and formatting
"""
# First fix any invalid escape sequences
text = re.sub(r'\\([^"\\/bfnrtu])', r'\1', text)

# Handle proper escape sequences
text = text.replace(r'\"', '"')
text = text.replace(r'\/', '/')
text = text.replace(r"\'", "'")
text = text.replace(r'\b', '')
text = text.replace(r'\f', '')
text = text.replace(r'\n', '\n')
text = text.replace(r'\r', '\r')
text = text.replace(r'\t', '\t')

# Cleanup array structure
text = re.sub(r'\}\s*\r?\n\s*\]$', '}]', text)
text = re.sub(r'\]\s*\}\s*\r?\n\s*\]$', ']}]', text)

# Clean newlines between objects
text = re.sub(r'},\s*\r?\n\s*{', '},{', text)

return text.strip()

def parse_json_safely(self, text: str) -> List[Dict]:
"""Safely parse JSON with multiple fallback strategies"""
try:
cleaned_text = self.clean_escape_sequences(text)
return json.loads(cleaned_text)
except json.JSONDecodeError as e1:
logger.debug(f"First parse attempt failed: {e1}")
try:
text = re.sub(r'[\x00-\x1F]+', '', text)
text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)
text = re.sub(r'\s+', ' ', text)
Inline review comments on this change:

Member: woah, this looks like a mess, thanks for fixing this.

Author (Collaborator): Yeah, escape sequences are no joke.

return json.loads(text)
except json.JSONDecodeError as e2:
logger.debug(f"Second parse attempt failed: {e2}")
try:
matches = re.findall(r'{[^{}]*}', text)
if matches:
valid_json = f"[{','.join(matches)}]"
return json.loads(valid_json)
raise e2
except json.JSONDecodeError as e3:
logger.error(f"""All parsing attempts
failed. Final error: {e3}""")
raise

def _get_first(self):
"""Get the first page of OSTI records"""
self._response = self._session.get(self.url)

if not self._response.ok:
msg = ('OSTI API Request got error {}: "{}"'
.format(self._response.status_code,
self._response.reason))
msg = f'''OSTI API Request got error
{self._response.status_code}:
"{self._response.reason}"'''
raise RuntimeError(msg)
first_page = self._response.json()

try:
raw_text = self._response.text
first_page = self.parse_json_safely(raw_text)
except (json.JSONDecodeError, UnicodeError) as e:
logger.error(f"""JSON decode error:
{str(e)}\nRaw text: {raw_text[:500]}...""")
raise
self._n_pages = 1
if 'last' in self._response.links:
url = self._response.links['last']['url']
self._n_pages = int(url.split('page=')[-1])

logger.debug('Found approximately {} records.'
.format(self._n_pages * len(first_page)))

return first_page

def _get_pages(self, n_pages):
@@ -219,18 +279,30 @@ def _get_pages(self, n_pages):
Returns
-------
next_pages : list
This function will return a generator of next pages, each of which
is a list of OSTI records
Generator of next pages, each a list of OSTI records
"""
if n_pages > 1:
for page in range(2, self._n_pages + 1):
if page <= n_pages:
next_page = self._session.get(self.url,
params={'page': page})
next_page = next_page.json()
yield next_page
else:
break
if n_pages <= 1:
return
for page in range(2, self._n_pages + 1):
if page > n_pages:
break

try:
response = self._session.get(
self.url,
params={'page': page}
)
if not response.ok:
logger.error(f"""Failed to get page {page}:
{response.status_code}""")
continue
page_records = self.parse_json_safely(response.text)

yield page_records

except Exception as e:
logger.error(f"Error processing page {page}: {str(e)}")
continue

def _get_all(self, n_pages):
"""Get all pages of records up to n_pages.
@@ -255,24 +327,27 @@

def download(self, out_dir):
"""Download all PDFs from the records in this OSTI object into a
directory. PDFs will be given file names based on their OSTI record ID
directory. PDFs will be given file names based on their OSTI record
ID

Parameters
----------
out_dir : str
Directory to download PDFs to. This directory will be created if it
does not already exist.
Directory to download PDFs to. This directory will be created if
it does not already exist.
"""
logger.info('Downloading {} records to: {}'.format(len(self), out_dir))
logger.info(f'Downloading {len(self)} records to: {out_dir}')
os.makedirs(out_dir, exist_ok=True)
for record in self:
fp_out = os.path.join(out_dir, record.osti_id + '.pdf')
if not os.path.exists(fp_out):
try:
record.download(fp_out)
except Exception as e:
logger.exception('Could not download OSTI ID {} "{}": {}'
.format(record.osti_id, record.title, e))
msg = (f'Could not download OSTI ID {record.osti_id} '
f'"{record.title}": {e}')
logger.exception(msg)

logger.info('Finished download!')

@property
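A note on the escape-sequence discussion above: the snippet below is a minimal standalone sketch (not code from this PR) of the kind of input the first substitution in clean_escape_sequences targets. A backslash that does not begin a legal JSON escape makes json.loads fail until it is stripped; the sample payload is invented for illustration.

import json
import re

# Invented payload: `\E` is not a legal JSON escape, so a direct
# parse raises json.JSONDecodeError.
raw = r'[{"title": "Wind \Energy \"Report\""}]'

try:
    json.loads(raw)
except json.JSONDecodeError as err:
    print('direct parse failed:', err)

# Mirrors the first regex pass in clean_escape_sequences(): drop any
# backslash that does not start a valid JSON escape, then parse again.
cleaned = re.sub(r'\\([^"\\/bfnrtu])', r'\1', raw)
print(json.loads(cleaned))  # [{'title': 'Wind Energy "Report"'}]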
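For orientation, a usage sketch built only from the signatures visible in this diff: OstiList(url, n_pages=...) with a default of one roughly 20-record page, and download(out_dir) saving one <osti_id>.pdf per record. The import path follows the file location elm/web/osti.py; the query URL is an illustrative assumption, see https://www.osti.gov/api/v1/docs for the actual parameters.

from elm.web.osti import OstiList

# Illustrative OSTI API query URL (assumption, not taken from this PR).
url = 'https://www.osti.gov/api/v1/records?research_org=NREL'

records = OstiList(url, n_pages=1)  # default: a single ~20-record page
records.download('./osti_pdfs')     # writes <osti_id>.pdf for each record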