diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..242c7c8 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.analysis.typeCheckingMode": "off", + "python.analysis.autoImportCompletions": true +} \ No newline at end of file diff --git a/SECURITY.md b/SECURITY.md index 4b8970d..53474a8 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -5,6 +5,7 @@ | Version | Supported | | ------- | ------------------ | +| 1.1.0 | Yes | ## Reporting a Vulnerability diff --git a/art/__init__.py b/art/__init__.py index 093112d..daf6df3 100644 --- a/art/__init__.py +++ b/art/__init__.py @@ -4,6 +4,8 @@ Academic Review Tool (ART) ========================== +Version: 1.1.0 + The Academic Review Tool (ART) is a package for performing academic reviews and bibliometric analyses in Python. It offers capabilities for discovering, retrieving, and analysing academic literature at scale. ART accesses records from Crossref, Web of Science, Scopus, Orcid, and more. @@ -30,6 +32,7 @@ * Geopy / Nominatim """ +from .utils.basics import open_file as open from .importers.crossref import lookup_doi, lookup_dois, lookup_journal, lookup_journals, search_journals, get_journal_entries, search_journal_entries, lookup_funder, lookup_funders, search_funders, get_funder_works, search_funder_works from .importers.crossref import search_works as search_crossref # from .importers.wos import search as search_wos @@ -39,17 +42,4 @@ # from .importers import pdf, orcid, crossref, scopus, jstor, wos from .classes import Results, References, Author, Authors, Funder, Funders, Affiliation, Affiliations, Review from .classes.networks import Network, Networks -from .classes.citation_crawler import academic_scraper as scrape - -import pickle - -def open_file(file_address: str = 'request_input'): # type: ignore - - if file_address == 'request_input': - file_address = input('File address: ') - - if (file_address.endswith('.txt')) or (file_address.endswith('.review')): - with open(file_address, 'rb') as f: - review = pickle.load(f) - - return review \ No newline at end of file +from .classes.citation_crawler import academic_scraper as scrape \ No newline at end of file diff --git a/art/__pycache__/__init__.cpython-39.pyc b/art/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..c133f09 Binary files /dev/null and b/art/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/classes/__pycache__/__init__.cpython-39.pyc b/art/classes/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..5446a24 Binary files /dev/null and b/art/classes/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/classes/__pycache__/activitylog.cpython-39.pyc b/art/classes/__pycache__/activitylog.cpython-39.pyc new file mode 100644 index 0000000..5222445 Binary files /dev/null and b/art/classes/__pycache__/activitylog.cpython-39.pyc differ diff --git a/art/classes/__pycache__/affiliations.cpython-39.pyc b/art/classes/__pycache__/affiliations.cpython-39.pyc new file mode 100644 index 0000000..2bca838 Binary files /dev/null and b/art/classes/__pycache__/affiliations.cpython-39.pyc differ diff --git a/art/classes/__pycache__/attrs.cpython-39.pyc b/art/classes/__pycache__/attrs.cpython-39.pyc new file mode 100644 index 0000000..85fea16 Binary files /dev/null and b/art/classes/__pycache__/attrs.cpython-39.pyc differ diff --git a/art/classes/__pycache__/authors.cpython-39.pyc b/art/classes/__pycache__/authors.cpython-39.pyc new file mode 100644 index 0000000..9aae71d Binary files 
/dev/null and b/art/classes/__pycache__/authors.cpython-39.pyc differ diff --git a/art/classes/__pycache__/citation_crawler.cpython-39.pyc b/art/classes/__pycache__/citation_crawler.cpython-39.pyc new file mode 100644 index 0000000..d62a042 Binary files /dev/null and b/art/classes/__pycache__/citation_crawler.cpython-39.pyc differ diff --git a/art/classes/__pycache__/entities.cpython-39.pyc b/art/classes/__pycache__/entities.cpython-39.pyc new file mode 100644 index 0000000..7e9db37 Binary files /dev/null and b/art/classes/__pycache__/entities.cpython-39.pyc differ diff --git a/art/classes/__pycache__/funders.cpython-39.pyc b/art/classes/__pycache__/funders.cpython-39.pyc new file mode 100644 index 0000000..dc5bbf8 Binary files /dev/null and b/art/classes/__pycache__/funders.cpython-39.pyc differ diff --git a/art/classes/__pycache__/networks.cpython-39.pyc b/art/classes/__pycache__/networks.cpython-39.pyc new file mode 100644 index 0000000..a7e7786 Binary files /dev/null and b/art/classes/__pycache__/networks.cpython-39.pyc differ diff --git a/art/classes/__pycache__/properties.cpython-39.pyc b/art/classes/__pycache__/properties.cpython-39.pyc new file mode 100644 index 0000000..70a8c8a Binary files /dev/null and b/art/classes/__pycache__/properties.cpython-39.pyc differ diff --git a/art/classes/__pycache__/references.cpython-39.pyc b/art/classes/__pycache__/references.cpython-39.pyc new file mode 100644 index 0000000..c6b2cbf Binary files /dev/null and b/art/classes/__pycache__/references.cpython-39.pyc differ diff --git a/art/classes/__pycache__/results.cpython-39.pyc b/art/classes/__pycache__/results.cpython-39.pyc new file mode 100644 index 0000000..d5cf287 Binary files /dev/null and b/art/classes/__pycache__/results.cpython-39.pyc differ diff --git a/art/classes/__pycache__/review.cpython-39.pyc b/art/classes/__pycache__/review.cpython-39.pyc new file mode 100644 index 0000000..e6d6087 Binary files /dev/null and b/art/classes/__pycache__/review.cpython-39.pyc differ diff --git a/art/classes/activitylog.py b/art/classes/activitylog.py index 41109ca..47bd5cc 100644 --- a/art/classes/activitylog.py +++ b/art/classes/activitylog.py @@ -6,23 +6,24 @@ class ActivityLog(pd.DataFrame): """ - This is a ActivityLog object. It is a modified Pandas Dataframe object designed to store metadata about an academic review. + This is an ActivityLog object. It is a modified Pandas Dataframe object designed to store metadata about an academic review. - Parameters - ---------- - - - Attributes - ---------- + Columns + ------- + * **timestamp**: date-time the activity occurred. + * **type**: type of activity. + * **activity**: details of activity. + * **location**: location in Review that activity occurred. + * **database**: name of database/repository accessed (if relevant). + * **url**: web address accessed (if relevant). + * **query**: search query used (if relevant). + * **changes**: number of changes made to the Review results. """ def __init__(self): """ Initialises ActivityLog instance. - - Parameters - ---------- """ @@ -44,6 +45,27 @@ def __init__(self): def add_activity(self, type: str, activity: str, location: list, database = None, query = None, url = None, changes_dict = None): + """ + Adds a new activity to the ActivityLog DataFrame. + + Parameters + ---------- + type : str + type of activity. + activity : str + details of activity. + location : str + name of location in Review that activity occurred. + database : str + name of database/repository accessed (if relevant). Defaults to None. 
+ query : str + search query used (if relevant). Defaults to None. + url : str + web address accessed (if relevant). Defaults to None. + changes_dict : dict + dictionary of changes made to Review. Defaults to None. + """ + new_index = len(self) self.loc[new_index, 'timestamp'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S") self.loc[new_index, 'type'] = type diff --git a/art/classes/affiliations.py b/art/classes/affiliations.py index 11355e1..04b622a 100644 --- a/art/classes/affiliations.py +++ b/art/classes/affiliations.py @@ -13,6 +13,20 @@ def generate_affiliation_id(affiliation_data: pd.Series): + """ + Takes a Pandas Series containing affiliation details and returns a unique identifier code (affiliation ID). + + Parameters + ---------- + affiliation_data : pandas.Series + a series containing affiliation data. + + Returns + ------- + affiliation_id : str + an affiliation ID. + """ + affiliation_data = affiliation_data.copy(deep=True).dropna().astype(str).str.lower() affiliation_id = 'AFFIL:' @@ -82,14 +96,37 @@ def generate_affiliation_id(affiliation_data: pd.Series): class Affiliation(Entity): """ - This is a Affiliation object. It is designed to store data about an organisation that an author is affiliated with. - - Parameters - ---------- + This is an Affiliation object. It is designed to store data about an organisation that an author is affiliated with. + Parameters + ---------- + affiliation_id : str + a unique identifier assigned to the affiliation. Defaults to None. + name : str + the affiliation's name. Defaults to None. + location : str + a street address or place name associated with the affiliation. Defaults to None. + email : str + the affiliation's email address. Defaults to None. + uri : str + a DOI or other unique identifier assigned to the affiliation. Defaults to None. + crossref_id : str + a CrossRef identifier assigned to the affiliation. Defaults to None. + website : str + the affiliation's website. Defaults to None. + other_links : str or list + any other links associated with the affiliation. Defaults to None. + use_api : bool + whether to update affiliation data using APIs (e.g. CrossRef). Defaults to None. - Attributes - ---------- + Attributes + ---------- + summary : pandas.DataFrame + a dataframe summarising the Affiliation's data. + publications : Results + a Results dataframe containing data on the Affiliation's publications. """ def __init__(self, @@ -106,10 +143,30 @@ def __init__(self, ): """ - Initialises affiliation instance. + Initialises an Affiliation instance. Parameters ---------- + affiliation_id : str + a unique identifier assigned to the affiliation. Defaults to None. + name : str + the affiliation's name. Defaults to None. + location : str + a street address or place name associated with the affiliation. Defaults to None. + email : str + the affiliation's email address. Defaults to None. + uri : str + a DOI or other unique identifier assigned to the affiliation. Defaults to None. + crossref_id : str + a CrossRef identifier assigned to the affiliation. Defaults to None. + website : str + the affiliation's website. Defaults to None. + other_links : str or list + any other links associated with the affiliation. Defaults to None. + use_api : bool + whether to update affiliation data using APIs (e.g. CrossRef). Defaults to None. 
""" super().__init__() @@ -205,6 +262,15 @@ def __init__(self, def generate_id(self): + """ + Returns a unique identifier (affiliation ID) based on the Affiliation's data. + + Returns + ------- + affiliation_id : str + an affiliation ID. + """ + affiliation_data = self.summary.loc[0] affiliation_id = generate_affiliation_id(affiliation_data) # type: ignore @@ -212,6 +278,10 @@ def generate_id(self): def update_id(self): + """ + Replaces the Affiliation's existing unique identifier with a newly generated unique identifier based on the Affiliation's data. + """ + current_id = str(self.summary.loc[0, 'affiliation_id']) if (current_id == None) or (current_id == 'None') or (current_id == '') or (current_id == 'AFFIL:000') or ('no_name_given' in current_id): @@ -222,7 +292,17 @@ def update_id(self): def __getitem__(self, key): """ - Retrieves affiliation attribute using a key. + Retrieves an Affiliation attribute or datapoint using a key. The key may be an attribute name, dataframe index position, or dataframe column name. + + Parameters + ---------- + key : object + an attribute name, dataframe index position, or dataframe column name. + + Returns + ------- + value : object + an object associated with the inputted key. """ if key in self.__dict__.keys(): @@ -233,9 +313,17 @@ def __getitem__(self, key): def __repr__(self) -> str: + """ + Defines how Affiliation objects are represented in string form. + """ + return str(self.summary.loc[0, 'name']) def has_uri(self) -> bool: + + """ + Returns True if the Affilation has a URI associated. Else, returns False. + """ uri = self.summary.loc[0, 'uri'] @@ -246,6 +334,10 @@ def has_uri(self) -> bool: def add_dict(self, data: dict): + """ + Adds a dictionary of affiliation data to the Affiliation's summary dataframe. + """ + if 'name' in data.keys(): name = data['name'] self.summary.loc[0, 'name'] = name @@ -284,6 +376,22 @@ def add_dict(self, data: dict): def from_dict(data: dict, use_api=False): # type: ignore + """ + Takes a dictionary of affiliation data and returns an Affiliation object. + + Parameters + ---------- + data : dict + a dictionary of affiliation data. The dictionary must contain at least one of the following keys: 'name', 'location', 'address', 'email', 'crossref_id', 'DOI', 'URL'. + use_api : bool + whether to update affiliation data using APIs (e.g. CrossRef). Defaults to None. + + Returns + ------- + affiliation : Affiliation + an Affiliation object. + """ + if 'name' in data.keys(): name = data['name'] else: @@ -346,19 +454,64 @@ def from_dict(data: dict, use_api=False): # type: ignore return affiliation def add_series(self, series: pd.Series): + + """ + Adds a Pandas Series object to the Affiliation's summary dataframe. + """ + self.summary.loc[0] = series def from_series(data: pd.Series): # type: ignore + + """ + Takes a Pandas Series and returns an Affiliation object. + + Parameters + ---------- + data : pandas.Series + a Pandas Series with indices that match the names of columns in the Affiliation's summary dataframe. + + Returns + ------- + affiliation : Affiliation + a Affiliation object. + """ + affiliation = Affiliation() affiliation.add_series(data) return affiliation def add_dataframe(self, dataframe: pd.DataFrame): + + """ + Adds data from a Pandas DataFrame to the Affiliation object. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame with columns that match the names of columns in the Affiliation's summary dataframe. 
+ """ + series = dataframe.loc[0] self.add_series(series) # type: ignore def from_dataframe(data: pd.DataFrame): # type: ignore + + """ + Takes a Pandas DataFrame and returns an Affiliation object. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame with columns that match the names of columns in the Affiliation object's summary dataframe. + + Returns + ------- + affiliation : Affiliation + an Affiliation object. + """ + affiliation = Affiliation() affiliation.add_dataframe(data) @@ -366,6 +519,15 @@ def from_dataframe(data: pd.DataFrame): # type: ignore def import_crossref_result(self, crossref_result: pd.Series): + """ + Reads a CrossRef API result formatted as a pandas.Series and adds its data to the Affiliation object. + + Parameters + ---------- + crossref_result : pandas.Series + CrossRef API result. + """ + if 'name' in crossref_result.index: name = crossref_result['name'] else: @@ -399,6 +561,20 @@ def import_crossref_result(self, crossref_result: pd.Series): def from_crossref_result(crossref_result: pd.Series, use_api: bool = False): # type: ignore + """ + Reads a CrossRef API result formatted as a pandas.Series and returns as an Affiliation object. + + Parameters + ---------- + crossref_result : pandas.Series + CrossRef API result. + + Returns + ------- + affiliation : Affiliation + an Affiliation object. + """ + if 'name' in crossref_result.index: name = crossref_result['name'] else: @@ -430,21 +606,78 @@ def from_crossref_result(crossref_result: pd.Series, use_api: bool = False): # t def import_crossref(self, crossref_id: str, timeout = 60): + """ + Looks up a CrossRef affiliation ID and adds the result to the Affiliation object. + + Parameters + ---------- + crossref_id : str + CrossRef affiliation ID. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + """ + res = lookup_funder(crossref_id, timeout) self.import_crossref_result(res.loc[0]) # type: ignore def from_crossref(crossref_id: str, use_api=True, timeout = 60): # type: ignore + + """ + Looks up a CrossRef affiliation ID and returns the result as an Affiliation object. + + Parameters + ---------- + crossref_id : str + CrossRef affiliation ID. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + affiliation : Affiliation + an Affiliation object. + """ + res = lookup_funder(crossref_id, timeout) affiliation = Affiliation.from_crossref_result(crossref_result=res, use_api=use_api) # type: ignore return affiliation def import_uri(self, uri: int, timeout = 60): + + """ + Looks up an affiliation URI using the CrossRef API and adds the result to the Affiliation object. + + Parameters + ---------- + uri : str + affiliation URI. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + """ + uri_str = str(uri) res = lookup_funder(uri_str, timeout) self.import_crossref_result(res.loc[0]) # type: ignore def from_uri(uri: int, use_api=True, timeout = 60): # type: ignore + + """ + Looks up an affiliation URI using the CrossRef API and returns the result as an Affiliation object. + + Parameters + ---------- + uri : str + affiliation URI. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + affiliation : Affiliation + an Affiliation object. 
+ """ + uri_str = str(uri) res = lookup_funder(uri_str, timeout) affiliation = Affiliation.from_crossref_result(crossref_result=res, use_api=use_api) # type: ignore @@ -453,6 +686,10 @@ def from_uri(uri: int, use_api=True, timeout = 60): # type: ignore def update_address(self): + """ + Updates the Affiliation's street address by looking up its name, location, and/or existing address data using geopy. + """ + if self.summary.loc[0, 'name'] != None: name = str(self.summary.loc[0, 'name']).strip().replace('{','').replace('}','').replace('[','').replace(']','').replace(',',' ').replace(' ',' ').strip() else: @@ -491,9 +728,17 @@ def update_address(self): if self.summary.loc[0, 'location'] == None: self.summary.loc[0, 'location'] = loc.display_name - def update_from_crossref(self, timeout = 60): + """ + Looks up the Affiliation's CrossRef ID. If one is found, uses to update the Affiliation object. + + Parameters + ---------- + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + """ + uid = self.summary.loc[0,'crossref_id'] if uid == None: uid = self.summary.loc[0,'uri'] @@ -506,6 +751,15 @@ def update_from_crossref(self, timeout = 60): def update_from_uri(self, timeout = 60): + """ + Looks up the Affiliation's URI using the CrossRef API. If one is found, uses to update the Affiliation object. + + Parameters + ---------- + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + """ + uid = self.summary.loc[0, 'uri'] if uid == None: uid = self.summary.loc[0, 'crossref'] @@ -519,14 +773,21 @@ def update_from_uri(self, timeout = 60): class Affiliations(Entities): """ - This is a Affiliations object. It contains a collection of Affiliations objects and compiles data about them. + This is an Affiliations object. It contains a collection of Affiliation objects and a summary of data about them. Parameters ---------- - + affiliations_data : list or dict + Optional: an iterable of affiliations data. Data on individual affiliations must be formatted as dictionaries. Attributes ---------- + summary : pandas.DataFrame + a dataframe summarising the Affiliations collection's data. + all : dict + a dictionary storing formatted Affiliation objects. + data : list + a list of any unformatted data associated with Affiliation objects in the collection. """ def __init__(self, affiliations_data = None): @@ -536,6 +797,8 @@ def __init__(self, affiliations_data = None): Parameters ---------- + affiliations_data : list or dict + Optional: an iterable of affiliations data. Data on individual affiliations must be formatted as dictionaries. """ super().__init__() @@ -605,7 +868,7 @@ def __init__(self, affiliations_data = None): def __getitem__(self, key): """ - Retrieves affiliations attribute using a key. + Retrieves Affiliations attribute using a key. """ if key in self.__dict__.keys(): @@ -622,14 +885,37 @@ def __getitem__(self, key): def __repr__(self) -> str: + """ + Defines how Affiliations objects are represented in string form. + """ + alphabetical = str(self.summary['name'].sort_values().to_list()).replace('[','').replace(']','') return alphabetical def __len__(self) -> int: + + """ + Returns the number of Affiliation objects in the Affiliations collection. Counts the number of Affiliation objects stored in the Affiliations.all dictionary. + + Returns + ------- + result : int + the number of Affiliation objects contained in the Affiliations.all dictionary. 
+ """ + return len(self.all.keys()) def drop_empty_rows(self): + """ + Drops rows that contain no data from Affiliations.summary dataframe. + + Returns + ------- + self : Affiliations + an Affiliations object. + """ + ignore_cols = ['affiliation_id', 'address', 'email', 'other_links'] df = self.summary.copy(deep=True) @@ -644,6 +930,22 @@ def drop_empty_rows(self): def remove_duplicates(self, drop_empty_rows = True, sync = False): + """ + Removes duplicate Affiliation entries from the Affiliations collection. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to True. + sync : bool + whether to synchronise the Affiliations.summary dataframe with the Affiliations.all dictionary. Defaults to False. + + Returns + ------- + self : Affiliations + an Affiliations object. + """ + if drop_empty_rows == True: self.drop_empty_rows() @@ -654,13 +956,23 @@ def remove_duplicates(self, drop_empty_rows = True, sync = False): self.summary = deduplicate(self.summary) if sync == True: - self.sync_details(drop_duplicates=False, drop_empty_rows=False) + self.sync_summary(drop_duplicates=False, drop_empty_rows=False) return self - def sync_all(self, drop_duplicates = False, drop_empty_rows=False): + """ + Updates the Affiliations.summary dataframe using the Affiliation objects in the Affiliations.all dictionary. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + for i in self.all.keys(): affil = self.all[i] affil.update_id() @@ -679,7 +991,18 @@ def sync_all(self, drop_duplicates = False, drop_empty_rows=False): if drop_duplicates == True: self.remove_duplicates(drop_empty_rows=drop_empty_rows) - def sync_details(self, drop_duplicates = False, drop_empty_rows=False): + def sync_summary(self, drop_duplicates = False, drop_empty_rows=False): + + """ + Updates all Affiliation objects in the Affiliations.all dictionary using the Affiliations.summary dataframe. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ self.update_ids(sync=False) @@ -716,9 +1039,19 @@ def sync_details(self, drop_duplicates = False, drop_empty_rows=False): if drop_duplicates == True: self.remove_duplicates(drop_empty_rows=drop_empty_rows) - def sync(self, drop_duplicates = False, drop_empty_rows=False): + """ + Synchronises the Affiliations.summary dataframe with the Affiliation objects in the Affiliations.all dictionary. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. 
+ """ + if drop_empty_rows == True: self.drop_empty_rows() @@ -729,20 +1062,37 @@ def sync(self, drop_duplicates = False, drop_empty_rows=False): details_len = len(self.all) if all_len > details_len: - self.sync_details(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + self.sync_summary(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return else: if details_len > all_len: self.sync_all(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return else: - self.sync_details(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + self.sync_summary(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) self.sync_all(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return - def merge(self, affiliations, drop_duplicates = False, drop_empty_rows=True): + """ + Merges the Affiliations collection with another Affiliations collection. + + Parameters + ---------- + affiliations : Affiliations + the Affiliations collection to merge with. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + self : Affiliations + the merged Affiliations collection. + """ + left = self.summary.copy(deep=True) right = affiliations.summary.copy(deep=True) @@ -790,9 +1140,29 @@ def merge(self, affiliations, drop_duplicates = False, drop_empty_rows=True): return self - def add_affiliation(self, affiliation: Affiliation = None, name: str = None, uri: str = None, crossref_id: int = None, data = None, use_api = True, drop_duplicates = False, drop_empty_rows=False): # type: ignore + """ + Adds an Affiliation or affiliation data to the Affiliations collection. + + Parameters + ---------- + affiliation : Affiliation + an Affiliation object to add. + uri : str + a URI identifier to look up. Defaults to None. + crossref_id : str + a CrossRef ID to look up. Defaults to None. + data : dict + Optional: a dictionary containing affiliation data. Dictionary keys must match the names of columns in the Affiliations.summary dataframe. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + use_api : bool + whether to update the Affiliation data using the CrossRef API. Defaults to False. + """ + if type(affiliation) == str: affiliation = Affiliation(name = affiliation, use_api = use_api) @@ -848,9 +1218,23 @@ def add_affiliation(self, affiliation: Affiliation = None, name: str = None, ur self.update_ids() - - def add_list(self, affiliations_list: list, use_api: bool = False, drop_duplicates = False, drop_empty_rows=False): + def add_affiliations_list(self, affiliations_list: list, use_api: bool = False, drop_duplicates = False, drop_empty_rows=False): + """ + Adds a list of Affiliation objects to the Affiliations collection. + + Parameters + ---------- + affiliations_list : list[Affiliation] + a list of Affiliation objects. + use_api : bool + whether to update the Affiliations data using the CrossRef API. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. 
+ """ + for i in affiliations_list: if type(i) == Affiliation: self.add_affiliation(affiliation = i, use_api=use_api) @@ -873,6 +1257,19 @@ def add_list(self, affiliations_list: list, use_api: bool = False, drop_duplicat def update_ids(self, sync=False, drop_duplicates = False, drop_empty_rows=False): + """ + Updates affiliation IDs for all rows in the Affiliations.summary dataframe. + + Parameters + ---------- + sync : bool + whether to synchronise the Affiliations.summary dataframe with the Affiliation objects in the Affiliations.all dictionary. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + if sync == True: self.sync() @@ -908,6 +1305,19 @@ def update_ids(self, sync=False, drop_duplicates = False, drop_empty_rows=False) def update_addresses(self, sync=True, drop_duplicates = False, drop_empty_rows=False): + """ + Updates all Affiliations' street addresses by looking up their names, locations, and/or existing addresses data using geopy. + + Parameters + ---------- + sync : bool + whether to synchronise the Affiliations.summary dataframe with the Affiliation objects in the Affiliations.all dictionary. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + if sync == True: self.sync(drop_duplicates=drop_duplicates,drop_empty_rows=drop_empty_rows) @@ -930,6 +1340,17 @@ def update_addresses(self, sync=True, drop_duplicates = False, drop_empty_rows=F def update_from_crossref(self, drop_duplicates = False, drop_empty_rows=False): + """ + Looks up all Affiliations' CrossRef IDs and/or URIs using the CrossRef API. If found, uses to update the Affiliations collection. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + affiliation_ids = self.all.keys() for a in affiliation_ids: @@ -953,9 +1374,21 @@ def update_from_crossref(self, drop_duplicates = False, drop_empty_rows=False): self.update_ids() - def import_crossref_ids(self, crossref_ids: list, drop_duplicates = False, drop_empty_rows=False): + """ + Looks up a list of affiliations' CrossRef IDs and/or URIs using the CrossRef API. Adds any data found to the Affiliations collection. + + Parameters + ---------- + crossref_ids : list[str] + list containing affiliations' CrossRef IDs. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + for i in crossref_ids: auth = Affiliation.from_crossref(i) # type: ignore @@ -969,28 +1402,91 @@ def import_crossref_ids(self, crossref_ids: list, drop_duplicates = False, drop_ self.update_ids() - def from_crossref_ids(crossref_ids: list, drop_duplicates = False, drop_empty_rows=False): # type: ignore + """ + Looks up a list of affiliations' CrossRef IDs and/or URIs using the CrossRef API. Returns all data found as an Affiliations object. + + Parameters + ---------- + crossref_ids : list[str] + list containing affiliations' CrossRef IDs. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. 
Defaults to False. + + Returns + ------- + funders : Funders + a Funders object. + """ + affiliations = Affiliations() affiliations.import_crossref_ids(crossref_ids, drop_empty_rows=drop_empty_rows, drop_duplicates=drop_duplicates) return affiliations - def with_crossref(self): + def has_crossref(self): + + """ + Returns all rows in Affiliations.summary which contain CrossRef IDs. + """ + return self.summary[~self.summary['crossref_id'].isna()] - def with_uri(self): + def has_uri(self): + + """ + Returns all rows in Affiliations.summary which contain URIs. + """ + return self.summary[~self.summary['uri'].isna()] - def from_list(affiliations_list: list, use_api: bool = False, drop_duplicates = False, drop_empty_rows=False): # type: ignore + def from_affiliations_list(affiliations_list: list, use_api: bool = False, drop_duplicates = False, drop_empty_rows=False): # type: ignore + + """ + Reads a list of Affiliation objects and returns as an Affiliations object. + + Parameters + ---------- + affiliations_list : list[Affiliation] + a list of Affiliation objects. + use_api : bool + whether to update the Affiliations data using the CrossRef API. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + affiliations : Affiliations + an Affiliations object. + """ + affiliations = Affiliations() - affiliations.add_list(affiliations_list, use_api=use_api, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + affiliations.add_affiliations_list(affiliations_list, use_api=use_api, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return affiliations def import_crossref_result(self, crossref_result: pd.DataFrame, use_api = False, drop_duplicates = False, drop_empty_rows=False): + """ + Reads a pandas.DataFrame containing CrossRef API results and adds the data to the Affiliations collection. + + Parameters + ---------- + crossref_result : pandas.Dataframe + CrossRef API results. + use_api : bool + whether to update the Affiliations data using the CrossRef API. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + for i in crossref_result.index: data = crossref_result.loc[i] @@ -999,9 +1495,28 @@ def import_crossref_result(self, crossref_result: pd.DataFrame, use_api = False, self.update_ids() - def from_crossref_result(crossref_result: pd.DataFrame, use_api: bool = False, drop_duplicates = False, drop_empty_rows=False): # type: ignore + """ + Reads a pandas.DataFrame containing CrossRef API results and returns as a Affiliations object. + + Parameters + ---------- + crossref_result : pandas.Dataframe + CrossRef API results. + use_api : bool + whether to update the Funders data using the CrossRef API. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + affiliations : Affiliations + a Affiliations object. 
+ """ + affiliations = Affiliations() affiliations.import_crossref_result(crossref_result, use_api=use_api, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -1009,6 +1524,21 @@ def from_crossref_result(crossref_result: pd.DataFrame, use_api: bool = False, d def format_affiliations(affiliation_data, use_api = False, drop_duplicates = False, drop_empty_rows=True): + """ + Formats a collection of affiliations data as an Affiliations object. + + Parameters + ---------- + affiliation_data : object + a collection of affiliations data. + use_api : bool + whether to update the Affiliations data using the CrossRef API. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + result = Affiliations() affil_type = type(affiliation_data) @@ -1074,7 +1604,7 @@ def format_affiliations(affiliation_data, use_api = False, drop_duplicates = Fal if (affil_type == list) and (len(affiliation_data) > 0) and (type(affiliation_data[0]) == Affiliation): result = Affiliations() - result.add_list(affiliation_data) + result.add_affiliations_list(affiliation_data) if drop_empty_rows == True: result.drop_empty_rows() if drop_duplicates == True: diff --git a/art/classes/attrs.py b/art/classes/attrs.py index 628b170..80f4925 100644 --- a/art/classes/attrs.py +++ b/art/classes/attrs.py @@ -171,7 +171,7 @@ def export_folder(self, folder_name = 'obj_name', folder_address = 'request_inpu class AttrSet(Attr): """ - This is a collection of Attrs. + This is a collection of Attr objects. Notes ----- diff --git a/art/classes/authors.py b/art/classes/authors.py index f9d6a7b..0c17b18 100644 --- a/art/classes/authors.py +++ b/art/classes/authors.py @@ -27,7 +27,21 @@ 'other_ids' ] -def get_full_name(series: pd.Series): +def get_full_name(series: pd.Series) -> str: + + """ + Takes a Pandas Series containing author details and returns the author's name data as a string in full name ({given_name} {family_name}") format. + + Parameters + ---------- + series : pandas.Series + a series containing author data + + Returns + ------- + result : str + a full name. + """ given = series.loc['given_name'] family = series.loc['family_name'] @@ -60,7 +74,21 @@ def get_full_name(series: pd.Series): return result -def generate_author_id(author_data: pd.Series): +def generate_author_id(author_data: pd.Series) -> str: + + """ + Takes a Pandas Series containing author details and returns a unique identifier code. + + Parameters + ---------- + author_data : pandas.Series + a series containing author data + + Returns + ------- + author_id : str + an author ID. + """ author_data = author_data.copy(deep=True).dropna().astype(str).str.lower() @@ -153,10 +181,37 @@ class Author(Entity): Parameters ---------- - + author_id : str + a unique identifier assigned to the author. Defaults to None. + full_name : str + the author's name in full name ("{given_name} {family_name}") format. Defaults to None. + given_name : str + the author's given name or first name. Defaults to None. + family_name : str + the author's family name or last name. Defaults to None. + email : str + the author's email address. Defaults to None. + affiliations : list or dict or Affiliations + data on the author's organisational affiliations. May be a list, dict, or Affiliations object. Defaults to None. + publications : list or dict or pandas.DataFrame or Results + data on the author's publications. 
May be a list, dict, Pandas DataFrame or Results object. Defaults to None. + orcid : str + an Orcid identifier assigned to the author. Defaults to None. + google_scholar : str + a Google Scholar identifier assigned to the author. Defaults to None. + scopus : str + an Scopus identifier assigned to the author. Defaults to None. + crossref : str + an CrossRef identifier assigned to the author. Defaults to None. + other_links : str or list + any other links associated with the author. Defaults to None. Attributes ---------- + summary : pandas.DataFrame + a dataframe summarising the Author's data. + publications : Results + a Results dataframe containing data on the Author's publications. """ def __init__(self, @@ -179,6 +234,30 @@ def __init__(self, Parameters ---------- + author_id : str + a unique identifier assigned to the author. Defaults to None. + full_name : str + the author's full name formatted as "{given_name} {family_name}". Defaults to None. + given_name : str + the author's given name or first name. Defaults to None. + family_name : str + the author's family name or last name. Defaults to None. + email : str + the author's email address. Defaults to None. + affiliations : list or dict or Affiliations + data on the author's organisational affiliations. May be a list, dict, or Affiliations object. Defaults to None. + publications : list or dict or pandas.DataFrame or Results + data on the author's publications. May be a list, dict, Pandas DataFrame or Results object. Defaults to None. + orcid : str + an Orcid identifier assigned to the author. Defaults to None. + google_scholar : str + a Google Scholar identifier assigned to the author. Defaults to None. + scopus : str + an Scopus identifier assigned to the author. Defaults to None. + crossref : str + an CrossRef identifier assigned to the author. Defaults to None. + other_links : str or list + any other links associated with the author. Defaults to None. """ super().__init__() @@ -215,7 +294,7 @@ def __init__(self, self.summary.loc[0, 'crossref'] = crossref self.summary.loc[0, 'other_links'] = other_links - full_name = self.get_full_name() + full_name = self.full_name() if full_name != self.summary.loc[0, 'full_name']: self.summary.loc[0, 'full_name'] = full_name @@ -223,6 +302,15 @@ def __init__(self, def generate_id(self): + """ + Returns a unique identifier based on the Author's data. + + Returns + ------- + author_id : str + an author identifier. + """ + author_data = self.summary.loc[0] author_id = generate_author_id(author_data) # type: ignore @@ -230,6 +318,10 @@ def generate_id(self): def update_id(self): + """ + Replaces the Author's existing unique identifier with a newly generated unique identifier based on the Author's data. + """ + current_id = self.summary.loc[0, 'author_id'] new_id = self.generate_id() @@ -237,10 +329,20 @@ def update_id(self): self.summary.loc[0, 'author_id'] = new_id - def __getitem__(self, key): + def __getitem__(self, key) -> object: """ - Retrieves Author attribute using a key. + Retrieves an Author attribute or datapoint using a key. The key may be an attribute name, dataframe index position, or dataframe column name. + + Parameters + ---------- + key : object + an attribute name, dataframe index position, or dataframe column name. + + Returns + ------- + value : object + an object associated with the inputted key. 
""" if key in self.__dict__.keys(): @@ -253,20 +355,43 @@ def __getitem__(self, key): return self.publications[key] def __repr__(self) -> str: + + """ + Defines how Author objects are represented in string form. + """ + return str(self.summary.loc[0, 'full_name']) - def get_full_name(self): + def full_name(self) -> str: + + """ + Returns the author's name data as a string in full name ("{given_name} {family_name}") format. + + Returns + ------- + result : str + the Author's name data in full name ("{given_name} {family_name}") format. + """ + series = self.summary.loc[0] return get_full_name(series=series) # type: ignore def update_full_name(self): - full_name = self.get_full_name() + """ + Updates the Author's full name entry using the current Author data. + """ + + full_name = self.full_name() self.summary.loc[0, 'full_name'] = full_name def name_set(self) -> set: + """ + Returns the Author's given name and family name as a set. + """ + given = str(self.summary.loc[0, 'given_name']) family = str(self.summary.loc[0, 'family_name']) @@ -274,6 +399,10 @@ def name_set(self) -> set: def has_orcid(self) -> bool: + """ + Returns True if the Author has an Orcid ID associated. Else, returns False. + """ + orcid = self.summary.loc[0, 'orcid'] if (type(orcid) == str) and (orcid != ''): @@ -283,29 +412,87 @@ def has_orcid(self) -> bool: def format_affiliations(self): + """ + Formats the Author's affiliations data as an Affiliations object. + """ + affils_data = self.summary.loc[0, 'affiliations'] affiliations = format_affiliations(affils_data) self.summary.at[0, 'affiliations'] = affiliations def add_series(self, series: pd.Series): + + """ + Adds a Pandas Series object to the Author's summary data. + """ + self.summary.loc[0] = series def from_series(series: pd.Series): # type: ignore + + """ + Takes a Pandas Series and returns an Author object. + + Parameters + ---------- + series : pandas.Series + a Pandas Series with indices that match the names of columns in the Author summary dataframe. + + Returns + ------- + author : Author + an Author object. + """ + author = Author() author.add_series(series) return author def add_dataframe(self, dataframe: pd.DataFrame): + + """ + Adds data from a Pandas DataFrame to the Author object. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame with columns that match the names of columns in the Author's summary dataframe. + """ + series = dataframe.loc[0] self.add_series(series) # type: ignore def from_dataframe(dataframe: pd.DataFrame): # type: ignore + + """ + Takes a Pandas DataFrame and returns an Author object. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame with columns that match the names of columns in the Author's summary dataframe. + + Returns + ------- + author : Author + an Author object. + """ + author = Author() author.add_dataframe(dataframe) return author def import_crossref(self, crossref_result: dict): + """ + Reads a CrossRef API result formatted as a dictionary and adds its data to the Author object. + + Parameters + ---------- + crossref_result : dict + CrossRef API result formatted as a dictionary. + """ + if 'given' in crossref_result.keys(): self.summary.loc[0, 'given_name'] = crossref_result['given'] @@ -339,6 +526,15 @@ def import_crossref(self, crossref_result: dict): def import_orcid(self, orcid_id: str): + """ + Looks up an author record in the ORCID API using an ORCID author ID. If one is found, adds its data to the Author object. 
+ + Parameters + ---------- + orcid_id : str + ORCID author ID. + """ + try: auth_res = get_author(orcid_id) auth_record = auth_res.record() @@ -472,6 +668,20 @@ def import_orcid(self, orcid_id: str): def from_crossref(crossref_result: dict): # type: ignore + """ + Reads a CrossRef API result formatted as a dictionary and returns as an Author object. + + Parameters + ---------- + crossref_result : dict + CrossRef API result formatted as a dictionary. + + Returns + ------- + author : Author + an Author object. + """ + author = Author() author.import_crossref(crossref_result) author.update_full_name() @@ -480,6 +690,20 @@ def from_crossref(crossref_result: dict): # type: ignore def from_orcid(orcid_id: str): # type: ignore + """ + Looks up an author record in the ORCID API using an ORCID author ID. If one is found, returns as an Author object. + + Parameters + ---------- + orcid_id : str + ORCID author ID. + + Returns + ------- + author : Author + an Author object. + """ + author = Author() author.import_orcid(orcid_id) author.update_full_name() @@ -488,34 +712,49 @@ def from_orcid(orcid_id: str): # type: ignore def update_from_orcid(self): + """ + Looks up Author's ORCID author ID. If one is found, uses to update the Author object. + """ + orcid = self.summary.loc[0, 'orcid'] if (orcid != None) and (orcid != '') and (orcid != 'None'): orcid = str(orcid).replace('https://', '').replace('http://', '').replace('orcid.org/', '') + self.summary.loc[0, 'orcid'] = orcid self.import_orcid(orcid_id = orcid) class Authors(Entities): """ - This is an Authors object. It contains a collection of Authors objects and compiles data about them. + This is an Authors object. It contains a collection of Author objects and a summary of data about them. Parameters ---------- - + authors_data : list or dict + Optional: an iterable of authors data. Data on individual authors must be formatted as a dictionary. Attributes ---------- + summary : pandas.DataFrame + a dataframe summarising the Authors collection's data. + all : dict + a dictionary storing formatted Author objects. + data : list + a list of any unformatted data associated with Author objects in the collection. """ - def __init__(self, authors_data = None): + def __init__(self, authors_data = []): """ Initialises Authors instance. Parameters ---------- + authors_data : list or dict + Optional: an iterable of authors data. Data on individual authors must be formatted as a dictionary. + """ super().__init__() @@ -524,13 +763,9 @@ def __init__(self, authors_data = None): self.summary = pd.DataFrame(columns = author_cols, dtype = object) + self.data = authors_data - self.all = dict() - - self.data = [] - self.data.append(authors_data) - - if (type(authors_data) == list) and (type(authors_data[0]) == Author): + if (type(authors_data) == list) and (len(authors_data)>0) and (type(authors_data[0]) == Author): for i in authors_data: auth = i.summary.copy(deep=True) @@ -577,14 +812,44 @@ def __getitem__(self, key): def __repr__(self) -> str: + """ + Defines how Authors objects are represented in string form. + """ + alphabetical = self.summary['full_name'].sort_values().to_list().__repr__() return alphabetical def __len__(self) -> int: + + """ + Returns the number of Author objects in the Authors collection. Uses the number of Author objects stored in the Authors.all dictionary. + + Returns + ------- + result : int + the number of Author objects contained in the Authors.all dictionary. 
+ """ + return len(self.all.keys()) def remove_duplicates(self, drop_empty_rows = True, sync=True): + """ + Removes duplicate Author entries. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to True. + sync : bool + whether to synchronise the Authors.summary dataframe with the Authors.all dictionary. Defaults to True. + + Returns + ------- + self : Authors + an Authors object. + """ + if drop_empty_rows == True: self.drop_empty_rows() @@ -596,7 +861,7 @@ def remove_duplicates(self, drop_empty_rows = True, sync=True): self.summary = deduplicate(self.summary) if sync == True: - self.sync_details() + self.sync_summary() return self @@ -604,6 +869,24 @@ def remove_duplicates(self, drop_empty_rows = True, sync=True): def merge(self, authors, drop_duplicates = False, drop_empty_rows=False): + """ + Merges the Authors collection with another Authors collection. + + Parameters + ---------- + authors : Authors + the Authors collection to merge with. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + self : Authors + the merged Authors collection. + """ + left = self.summary.copy(deep=True) right = authors.summary.copy(deep=True) @@ -651,6 +934,10 @@ def merge(self, authors, drop_duplicates = False, drop_empty_rows=False): def update_full_names(self): + """ + Checks all Author entries for name data and uses this to update the 'full_name' field. + """ + for i in self.summary.index: new = get_full_name(self.summary.loc[i]) old = self.summary.loc[i, 'full_name'] @@ -659,10 +946,27 @@ def update_full_names(self): self.summary.loc[i, 'full_name'] = new self.update_author_ids() - self.sync_details() + self.sync_summary() def add_author(self, author: Author, data = None, drop_duplicates = False, drop_empty_rows = False, update_from_orcid = False): + """ + Adds an Author or author data to the Authors collection. + + Parameters + ---------- + author : Author + an Author object to add. + data : dict + Optional: a dictionary containing author data. Dictionary keys must match the names of columns in the Authors.summary dataframe. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + update_from_orcid : bool + whether to update the Author data using the ORCID API (if an ORCID ID is present). Defaults to False. + """ + if update_from_orcid == True: orcid = author.summary.loc[0,'orcid'] if (orcid != None) and (orcid != '') and (orcid != 'None'): @@ -699,6 +1003,19 @@ def add_author(self, author: Author, data = None, drop_duplicates = False, drop_ def add_authors_list(self, authors_list: list, drop_duplicates = False, drop_empty_rows = False): + """ + Adds a list containing Author objects to the Authors collection. + + Parameters + ---------- + authors_list : list[Author] + a list of Author objects. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. 
+ """ + for i in authors_list: if type(i) == Author: self.add_author(author = i, drop_duplicates=False) @@ -711,6 +1028,24 @@ def add_authors_list(self, authors_list: list, drop_duplicates = False, drop_emp def mask_entities(self, column, query: str = 'request_input', ignore_case: bool = True): + """ + Selects rows in Authors.summary dataframe with affiliations and funders that contain a query string. + + Parameters + ---------- + column : str + name of column to mask. + query : str + a string to check for. Defaults to requesting from user input. + ignore_case : bool + whether to ignore the case of string data. Defaults to True. + + Returns + ------- + masked : pandas.DataFrame + selected rows from the Authors.summary dataframe. + """ + if query == 'request_input': query = input('Search query').strip() @@ -731,13 +1066,30 @@ def entity_masker(entities): return masked def update_author_ids(self): + + """ + Updates author IDs for all rows in the Authors.summary dataframe. + """ for i in self.summary.index: author_data = self.summary.loc[i] author_id = generate_author_id(author_data) self.summary.loc[i, 'author_id'] = author_id + + def sync_all(self, drop_duplicates = False, drop_empty_rows=False): + + """ + Updates the Authors.summary dataframe using the Author objects in the Authors.all dictionary. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ for i in self.all.keys(): author = self.all[i] @@ -753,7 +1105,18 @@ def sync_all(self, drop_duplicates = False, drop_empty_rows=False): if drop_duplicates == True: self.remove_duplicates(drop_empty_rows=drop_empty_rows) - def sync_details(self, drop_duplicates = False, drop_empty_rows=False): + def sync_summary(self, drop_duplicates = False, drop_empty_rows=False): + + """ + Updates all Author objects in the Authors.all dictionary using the Authors.summary dataframe. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ self.update_author_ids() @@ -786,24 +1149,44 @@ def sync_details(self, drop_duplicates = False, drop_empty_rows=False): def sync(self, drop_duplicates = False, drop_empty_rows=False): + """ + Synchronises the Authors.summary dataframe with the Author objects in the Authors.all dictionary. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + all_len = len(self.summary) details_len = len(self.all) if all_len > details_len: - self.sync_details(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + self.sync_summary(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return else: if details_len > all_len: self.sync_all(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return else: - self.sync_details(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + self.sync_summary(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) self.sync_all(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return def drop_empty_rows(self): + """ + Drops rows that contain no data from Authors.summary dataframe. + + Returns + ------- + self : Authors + an Authors object. 
+ """ + ignore_cols = ['author_id', 'affiliations', 'publications', 'other_links'] df = self.summary.copy(deep=True) @@ -813,21 +1196,41 @@ def drop_empty_rows(self): df = df.dropna(axis=0, how='all', subset=drop_cols).reset_index().drop('index', axis=1) self.summary = df - self.sync_details() + self.sync_summary() return self def format_affiliations(self, drop_empty_rows=False): + """ + Formats authors' affiliations data as Affiliations objects and stores in Review's Affiliations attribute. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + """ + if drop_empty_rows == True: self.drop_empty_rows() affils = self.summary['affiliations'].apply(func=format_affiliations) # type: ignore self.summary['affiliations'] = affils - self.sync_details() + self.sync_summary() def update_from_orcid(self, drop_duplicates = False, drop_empty_rows=False): + """ + Looks up all Authors ORCID author IDs and, if found, uses to update the Authors collection. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + self.sync() author_ids = self.all.keys() @@ -853,6 +1256,19 @@ def update_from_orcid(self, drop_duplicates = False, drop_empty_rows=False): def import_orcid_ids(self, orcid_ids: list, drop_duplicates = False, drop_empty_rows=False): + """ + Looks up a list of ORCID author IDs using the ORCID API and adds any data found to the Authors collection. + + Parameters + ---------- + orcid_ids : list[str] + list containing ORCID author IDs. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + for i in orcid_ids: auth = Author.from_orcid(i) # type: ignore @@ -866,16 +1282,52 @@ def import_orcid_ids(self, orcid_ids: list, drop_duplicates = False, drop_empty_ def from_orcid_ids(orcid_ids: list, drop_duplicates = False, drop_empty_rows=False): # type: ignore + """ + Looks up a list of ORCID author IDs using the ORCID API and returns all data found as an Authors object. + + Parameters + ---------- + orcid_ids : list[str] + list containing ORCID author IDs. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + authors : Authors + an Authors object. + """ + authors = Authors() authors.import_orcid_ids(orcid_ids, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return authors - def with_orcid(self): + def has_orcid(self): + + """ + Returns all rows in Authors.summary which contain ORCID IDs. + """ + return self.summary[~self.summary['orcid'].isna()] def import_crossref(self, crossref_result: list, drop_duplicates = False, drop_empty_rows=False): + """ + Reads a list of CrossRef API results and adds the data to the Authors collection. + + Parameters + ---------- + crossref_result : list + list of CrossRef API results. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. 
+ """ + for i in crossref_result: auth = Author.from_crossref(i) # type: ignore @@ -889,6 +1341,24 @@ def import_crossref(self, crossref_result: list, drop_duplicates = False, drop_e def from_crossref(crossref_result: list, drop_duplicates = False, drop_empty_rows=False): # type: ignore + """ + Reads a list of CrossRef API results and returns as an Authors object. + + Parameters + ---------- + crossref_result : list + list of CrossRef API results. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + authors : Authors + an Authors object. + """ + authors = Authors() authors.import_crossref(crossref_result, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -896,6 +1366,19 @@ def from_crossref(crossref_result: list, drop_duplicates = False, drop_empty_row def import_wos(self, wos_result, drop_duplicates = False, drop_empty_rows=False): + """ + Reads a Web of Science API result and adds the data to the Authors collection. + + Parameters + ---------- + wos_result : list or dict + list of Web of Science API results. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + authors_data = [] if type(wos_result) == list: @@ -946,18 +1429,42 @@ def import_wos(self, wos_result, drop_duplicates = False, drop_empty_rows=False) if drop_duplicates == True: self.remove_duplicates(drop_empty_rows=drop_empty_rows) - self.sync_details(drop_duplicates=False, drop_empty_rows=False) + self.sync_summary(drop_duplicates=False, drop_empty_rows=False) def from_wos(wos_result, drop_duplicates = False, drop_empty_rows=False): # type: ignore + """ + Reads a Web of Science API result and returns as an Authors collection. + + Parameters + ---------- + wos_result : list or dict + list of Web of Science API results. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + authors : Authors + an Authors object. + """ + authors = Authors() authors.import_wos(wos_result=wos_result, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return authors - def affiliations(self, drop_duplicates = False, drop_empty_rows=False): + def affiliations(self, drop_duplicates = False, drop_empty_rows=False) -> dict: + + """ + Returns a dictionary containing authors and their associated affiliations. + * Keys: author IDs + * Values: Affiliations objects + """ - self.sync_details(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + self.sync_summary(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) output = {} for auth_id in self.all.keys(): @@ -969,6 +1476,23 @@ def affiliations(self, drop_duplicates = False, drop_empty_rows=False): def search_orcid(self, query: str = 'request_input', add_to_authors: bool = True): + """ + Searches for author records using the Orcid API. + + Parameters + ---------- + query : str + query to search. Allows for keywords and Boolean logic. + add_to_authors : bool + whether to add results to the Authors collection. + + Returns + ------- + result : pandas.DataFrame + search result. 
+ """ + + res = search_orcid(query=query) res = res.rename(columns={'credit-name': 'full_name', 'given-names': 'given_name', 'family-name': 'family_name', 'family-names': 'family_name', 'institution-name': 'affiliations', 'orcid-id': 'orcid'}) # type: ignore @@ -984,6 +1508,19 @@ def search_orcid(self, query: str = 'request_input', add_to_authors: bool = True def format_authors(author_data, drop_duplicates = False, drop_empty_rows=False): + """ + Formats a collection of authors data as an Authors object. + + Parameters + ---------- + author_data : object + a collection of authors data. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + result = Authors() if (author_data == None) or (author_data == ''): diff --git a/art/classes/citation_crawler.py b/art/classes/citation_crawler.py index 2c512dd..4788b05 100644 --- a/art/classes/citation_crawler.py +++ b/art/classes/citation_crawler.py @@ -4,7 +4,8 @@ from ..internet.crawlers import check_crawl_permission from ..internet.crawlers import correct_seed_errors as correct_seed_url_errors -from .references import extract_references +from .authors import format_authors +from .references import format_references import queue import time @@ -12,9 +13,12 @@ import pandas as pd import numpy as np - def crawler_scrape_url(url) -> pd.DataFrame: + """ + Core functionality for the citation crawler's web scraper. Takes a URL and returns a Pandas DataFrame. + """ + scrape_res = scrape_url(url=url) global results_cols @@ -110,6 +114,10 @@ def crawler_scrape_url(url) -> pd.DataFrame: def citation_crawler_site_test(url: str): + """ + Checks whether the citation crawler can crawl a given URL. Returns True if yes; False if no. + """ + global can_scrape for i in can_scrape: @@ -120,6 +128,22 @@ def citation_crawler_site_test(url: str): def academic_scraper(url, be_polite = False): + """ + Bespoke web scraper for academic repository websites. + + Parameters + ---------- + url : str + a URL to scrape. + be_polite : bool + whether to follow respect scraping permissions contained in websites' robots.txt files. + + Returns + ------- + res_df : pandas.DataFrame + a Pandas DataFrame containing scraped web data. + """ + # Checking if URL is bad. If True, tries to correct it. url = correct_seed_url_errors(url) domain = get_domain(url) @@ -159,6 +183,22 @@ def academic_scraper(url, be_polite = False): def citation_crawler_scraper(entry: pd.Series, be_polite = True): + """ + Bespoke web scraper for use by citation crawler. + + Parameters + ---------- + entry : pandas.Series + citation crawler entry. + be_polite : bool + whether to follow respect scraping permissions contained in websites' robots.txt files. + + Returns + ------- + entry : pandas.Series + citation crawler entry. + """ + url = entry['link'] res_df = academic_scraper(url=url, be_polite=be_polite) @@ -172,6 +212,24 @@ def citation_crawler_scraper(entry: pd.Series, be_polite = True): def citation_crawler_doi_retriver(entry: pd.Series, be_polite = True, timeout = 60): + """ + Takes citation crawler entry. If it contains a DOI, looks up the record using the CrossRef API. If not, scrapes the URL. + + Parameters + ---------- + entry : pandas.Series + citation crawler entry. + be_polite : bool + whether to follow respect scraping permissions contained in websites' robots.txt files. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. 
Defaults to 60 seconds. + + Returns + ------- + entry : pandas.Series + citation crawler entry. + """ + doi = entry['doi'] link = entry['link'] @@ -220,6 +278,24 @@ def citation_crawler_doi_retriver(entry: pd.Series, be_polite = True, timeout = def update_citation_crawler_data(entry: pd.Series, be_polite = True, timeout = 60): + """ + Takes citation crawler entry and updates the data using the CrossRef API if a record is available. + + Parameters + ---------- + entry : pandas.Series + citation crawler entry. + be_polite : bool + whether to follow respect scraping permissions contained in websites' robots.txt files. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + entry : pandas.Series + citation crawler entry. + """ + doi = entry['doi'] link = entry['link'] @@ -265,32 +341,27 @@ def citation_crawler_engine( Parameters ---------- - urls : queue - ordered queue of URLs to be crawled. - required_keywords : list - list of keywords which sites must contain to be crawled. - excluded_keywords : list - list of keywords which sites must *not* contain to be crawled. - excluded_url_terms : list - list of strings; link will be ignored if it contains any string in list. - case_sensitive : bool - whether or not to ignore string characters' case. + to_crawl : queue + records to crawl. + data : pandas.DataFrame + a dataframe of data gathered by the crawler. + use_api : bool + whether to lookup entries and update their data using APIs. Required for the crawler to find new and add new data. Defaults to True. crawl_limit : int how many URLs the crawler should visit before it stops. - ignore_urls : list - list of URLs to ignore. - ignore_domains : list - list of domains to ignore. + depth_limit : int + maximum number of crawler iterations to perform. be_polite : bool whether to respect websites' permissions for crawlers. - full : bool - whether to run a full scrape on each site. This takes longer. - + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. Returns ------- - output_dict : dict - a dictionary containing results from each crawled site. + data : pandas.DataFrame + a Pandas DataFrame containing results from the crawl. """ # Intiailising variables to store the pages already visited @@ -337,7 +408,7 @@ def citation_crawler_engine( # Formatting entry citations data - refs = extract_references(entry['citations_data'], add_work_ids = True, update_from_doi = False) + refs = format_references(entry['citations_data'], add_work_ids = True, update_from_doi = False) entry.at['citations'] = refs # Formatting entry authors data @@ -410,34 +481,25 @@ def citation_crawler( Parameters ---------- - seeds : str or list - one or more URLs from which to crawl. + data : pandas.DataFrame + a dataframe of data gathered by the crawler. + use_api : bool + whether to lookup entries and update their data using APIs. Required for the crawler to find new and add new data. Defaults to True. crawl_limit : int - how many URLs the crawler should visit before it stops. - excluded_url_terms : list - list of strings; link will be ignored if it contains any string in list. - required_keywords : list - list of keywords which sites must contain to be crawled. 
- excluded_keywords : list - list of keywords which sites must *not* contain to be crawled. - case_sensitive : bool - whether or not to ignore string characters' case. - ignore_urls : list - list of URLs to ignore. - ignore_domains : list - list of domains to ignore. + how many records the crawler should visit before it stops. + depth_limit : int + maximum number of crawler iterations to perform. be_polite : bool - whether respect websites' permissions for crawlers. - full : bool - whether to run a full scrape on each site. This takes longer. - output_as : str - the format to output results in. Defaults to a pandas.DataFrame. - + whether to respect websites' permissions for crawlers. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. Returns ------- - result : pd.DataFrame - an object containing the results of a crawl. + output : pd.DataFrame + an object containing the results from the crawl. """ # See https://www.zenrows.com/blog/web-crawler-python#transitioning-to-a-real-world-web-crawler diff --git a/art/classes/entities.py b/art/classes/entities.py index b51d5fe..db630eb 100644 --- a/art/classes/entities.py +++ b/art/classes/entities.py @@ -1,28 +1,22 @@ -from ..exporters.general_exporters import obj_to_folder, art_class_to_folder +from ..exporters.general_exporters import art_class_to_folder import pandas as pd -import numpy as np class Entity: """ - This is an Entity object. It is designed to store data about an individual entity and their publications. - - Parameters - ---------- - + This is an Entity object. It is intended as a superclass for Author, Funder, and Affiliation classes. Attributes ---------- + summary : pandas.DataFrame + a dataframe summarising the Entity's data. """ def __init__(self): """ - Initialises entity instance. - - Parameters - ---------- + Initialises Entity instance. """ @@ -33,7 +27,7 @@ def __init__(self): def __getitem__(self, key): """ - Retrieves entity attribute using a key. + Retrieves Entity attribute using a key. """ if key in self.__dict__.keys(): @@ -43,19 +37,50 @@ def __getitem__(self, key): return self.summary.loc[0, key] def get(self, key): + + """ + Retrieves Entity attribute using a key. + """ + return self[key] def __repr__(self) -> str: + + """ + Defines how Entity objects are represented in string form. + """ + return str(self.summary.loc[0]) - def search(self, query: str = 'request_input'): + def search(self, query: str = 'request_input', ignore_case: bool = True) -> pd.Series: + + """ + Searches Entity's summary data containing a matching string. + + Parameters + ---------- + query : str + a string to search for. Returns results where *any* matches are found. Defaults to requesting from user input. + ignore_case : bool + whether to ignore the case of string data. Defaults to True. + + Returns + ------- + output : pandas.Series + search results. 
+ """ if query == 'request_input': query = input('Search query').strip() query = query.strip().lower() - self_str = self.summary.copy(deep=True).loc[0].astype(str).str.lower() + self_str = self.summary.copy(deep=True).loc[0].astype(str) + + if ignore_case == True: + query = query.lower() + self_str = self_str.str.lower() + masked = self_str[self_str.str.contains(query)].index return self.summary.loc[0][masked] @@ -63,6 +88,10 @@ def search(self, query: str = 'request_input'): def has_uri(self) -> bool: + """ + Returns True if the Entity has a URI associated. + """ + if 'uri' in self.summary.columns: uri = self.summary.loc[0, 'uri'] @@ -75,9 +104,22 @@ def has_uri(self) -> bool: def add_dict(self, data: dict): - if 'name' in data.keys(): - name = data['name'] - self.summary.loc[0, 'name'] = name + """ + Adds data from a dictionary to the Entity object. + + Parameters + ---------- + data : dict + a dictionary with keys that match the names of columns in the Entity's summary dataframe. + """ + + cols = self.summary.columns.to_list() + + for c in cols: + + if c in data.keys(): + value = data[c] + self.summary.loc[0, c] = value if 'DOI' in data.keys(): uri = data['DOI'].replace('http', '').replace('https', '').replace('dx.', '').replace('doi.org/', '').strip() @@ -85,25 +127,89 @@ def add_dict(self, data: dict): def from_dict(data: dict): # type: ignore + """ + Takes a dictionary and returns an Entity. + + Parameters + ---------- + data : dict + a dictionary with keys that match the names of columns in the Entity's summary dataframe. + + Returns + ------- + entity : Entity + an Entity object. + """ + entity = Entity() entity.add_dict(data=data) return entity def add_series(self, series: pd.Series): + + """ + Adds data from a Pandas Series to the Entity object. + + Parameters + ---------- + series : pandas.Series + a Pandas Series with indices that match the names of columns in the Entity's summary dataframe. + """ + self.summary.loc[0] = series def from_series(data: pd.Series): # type: ignore + + """ + Takes a Pandas Series and returns an Entity. + + Parameters + ---------- + data : pandas.Series + a Pandas Series with indices that match the names of columns in the Entity's summary dataframe. + + Returns + ------- + entity : Entity + an Entity object. + """ + entity = Entity() entity.add_series(data) return entity def add_dataframe(self, dataframe: pd.DataFrame): + + """ + Adds data from a Pandas DataFrame to the Entity object. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame with columns that match the names of columns in the Entity's summary dataframe. + """ + series = dataframe.loc[0] self.add_series(series) # type: ignore def from_dataframe(data: pd.DataFrame): # type: ignore + + """ + Takes a Pandas DataFrame and returns an Entity. + + Parameters + ---------- + data : pandas.DataFrame + a Pandas DataFrame with columns that match the names of columns in the Entity's summary dataframe. + + Returns + ------- + entity : Entity + an Entity object. + """ + entity = Entity() entity.add_dataframe(data) @@ -116,31 +222,68 @@ def export_folder(self, export_dict_as: str = 'json', export_pandas_as: str = 'csv', export_network_as: str = 'graphML' - ): + ): + + """ + Exports the Entity object's contents to a folder. + Parameters + ---------- + folder_name : str + name of folder to create. Defaults to requesting from user input. + folder_address : str + directory address to create folder in. defaults to requesting for user input. 
+ export_str_as : str + file type for exporting string objects. Defaults to 'txt'. + export_dict_as : str + file type for exporting dictionary objects. Defaults to 'json'. + export_pandas_as : str + file type for exporting Pandas objects. Defaults to 'csv'. + export_network_as : str + file type for exporting network objects. Defaults to 'graphML'. + + Options + ------- + export_str_as: + * txt or .txt (Default) + export_dict_as: + * json or .json (Default) + * txt or .txt + export_pandas_as: + * csv or .csv (Default) + * xlsx or .xlsx or Excel + export_network_as: + * graphML or .graphML (Default) + * gml or .gml + * leda + * lgl + * ncol + * pajek + * kumu (i.e., formatted .json) + """ + art_class_to_folder(obj=self, folder_name=folder_name, folder_address=folder_address, export_str_as=export_str_as, export_dict_as=export_dict_as, export_pandas_as=export_pandas_as, export_network_as=export_network_as) class Entities: """ - This is an Entities object. It contains a collection of Entities objects and compiles data about them. - - Parameters - ---------- - + This is an Entities object. It is intended to be a superclass for Authors, Funders, and Affiliations classes. Attributes ---------- + summary : pandas.DataFrame + a dataframe summarising the Entites collection's data. + all : dict + a dictionary storing formatted Entity objects. + data : list + a list of any unformatted data associated with Entity objects in the collection. """ def __init__(self): """ Initialises Entities instance. - - Parameters - ---------- """ self.summary = pd.DataFrame(dtype = object) @@ -154,7 +297,7 @@ def __init__(self): def __getitem__(self, key): """ - Retrieves entities attribute using a key. + Retrieves Entities attribute using a key. """ if key in self.__dict__.keys(): @@ -174,10 +317,24 @@ def __getitem__(self, key): def __repr__(self) -> str: + """ + Defines how Entities objects are represented in string form. + """ + string = str(self.summary).replace('[','').replace(']','') return string def __len__(self) -> int: + + """ + Returns the number of Entity objects in the Entities collection. + + Returns + ------- + result : int + the number of Entity objects contained in the Entities collection. + """ + return len(self.all.keys()) def export_folder(self, @@ -187,8 +344,45 @@ def export_folder(self, export_dict_as: str = 'json', export_pandas_as: str = 'csv', export_network_as: str = 'graphML' - ): + ): + """ + Exports the Entities collection to a folder. + + Parameters + ---------- + folder_name : str + name of folder to create. Defaults to requesting from user input. + folder_address : str + directory address to create folder in. defaults to requesting for user input. + export_str_as : str + file type for exporting string objects. Defaults to 'txt'. + export_dict_as : str + file type for exporting dictionary objects. Defaults to 'json'. + export_pandas_as : str + file type for exporting Pandas objects. Defaults to 'csv'. + export_network_as : str + file type for exporting network objects. Defaults to 'graphML'. 
+ + Options + ------- + export_str_as: + * txt or .txt (Default) + export_dict_as: + * json or .json (Default) + * txt or .txt + export_pandas_as: + * csv or .csv (Default) + * xlsx or .xlsx or Excel + export_network_as: + * graphML or .graphML (Default) + * gml or .gml + * leda + * lgl + * ncol + * pajek + * kumu (i.e., formatted .json) + """ art_class_to_folder(obj=self, folder_name=folder_name, folder_address=folder_address, export_str_as=export_str_as, export_dict_as=export_dict_as, export_pandas_as=export_pandas_as, export_network_as=export_network_as) @@ -201,11 +395,62 @@ def save_as(self, export_pandas_as: str = 'csv', export_network_as: str = 'graphML'): + """ + Saves the Entities collection to a new file of a user-selected filetype with an inputted name at a specified location. + + Parameters + ---------- + filetype : str + type of file to save. Defaults to 'folder'. + file_name : str + name of file to create. Defaults to requesting from user input. + folder_address : str + directory address of folder to create file in. defaults to requesting from user input. + export_str_as : str + file type for exporting string objects. Defaults to 'txt'. + export_dict_as : str + file type for exporting dictionary objects. Defaults to 'json'. + export_pandas_as : str + file type for exporting Pandas objects. Defaults to 'csv'. + export_network_as : str + file type for exporting network objects. Defaults to 'graphML'. + + Options + ------- + filetype: + * folder (Default) + export_str_as: + * txt or .txt (Default) + export_dict_as: + * json or .json (Default) + * txt or .txt + export_pandas_as: + * csv or .csv (Default) + * xlsx or .xlsx or Excel + export_network_as: + * graphML or .graphML (Default) + * gml or .gml + * leda + * lgl + * ncol + * pajek + * kumu (i.e., formatted .json) + """ + if filetype == 'folder': self.export_folder(folder_name=file_name, folder_address=folder_address, export_str_as=export_str_as, export_dict_as=export_dict_as, export_pandas_as=export_pandas_as, export_network_as=export_network_as) def drop(self, entity_id): + + """ + Removes an Entity entry from the collection. + + Parameters + ---------- + entity_id : str + the ID of the Entity to remove. Can be an author_id, funder_id, or affiliation_id + """ if entity_id in self.all.keys(): del self.all[entity_id] @@ -226,6 +471,20 @@ def drop(self, entity_id): def merge(self, entities): + """ + Merges the Entities collection with another Entities collection. + + Parameters + ---------- + entities : Entities or Authors or Funders or Affiliations + the Entities collection to merge with. + + Returns + ------- + self : Entities + the merged Entities collections. + """ + left = self.summary.copy(deep=True) right = entities.summary.copy(deep=True) @@ -265,14 +524,32 @@ def merge(self, entities): return self - def with_crossref(self): + def has_crossref(self): + + """ + Returns summary data on all Entities entries which have CrossRef IDs. + + Returns + ------- + result : pandas.DataFrame + a masked dataframe. + """ if 'crossref_id' in self.summary.columns: return self.summary[~self.summary['crossref_id'].isna()] else: return pd.DataFrame(index=self.summary.columns, dtype=object) - def with_uri(self): + def has_uri(self): + + """ + Returns summary data on all Entities entries which have URIs (e.g. DOIs). + + Returns + ------- + result : pandas.DataFrame + a masked dataframe. 
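A hedged sketch (editorial) of saving an Entities-based collection with the save_as options documented above; the folder name is a placeholder and 'folder' is the only filetype handled by the method body.

from art.classes import Funders

funders = Funders()
funders.save_as(filetype='folder',
                file_name='my_review_funders',   # hypothetical folder name
                folder_address='.',              # write into the current working directory
                export_pandas_as='csv')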
+ """ if 'uri' in self.summary.columns: return self.summary[~self.summary['uri'].isna()] @@ -281,6 +558,10 @@ def with_uri(self): def contains(self, query: str = 'request_input', ignore_case: bool = True) -> bool: + """ + Returns True if the Entities collection contains data that matches a string. + """ + if query == 'request_input': query = input('Search query').strip() @@ -343,6 +624,20 @@ def contains(self, query: str = 'request_input', ignore_case: bool = True) -> bo def search_ids(self, query: str = 'request_input'): + """ + Searches Entity ID fields for a string. + + Parameters + ---------- + query : str + a string to search for. Defaults to requesting from user input. + + Returns + ------- + result : pandas.DataFrame + search results. + """ + if query == 'request_input': query = input('Search query').strip() @@ -366,6 +661,20 @@ def search_ids(self, query: str = 'request_input'): def search(self, query: str = 'request_input'): + """ + Searches summary data fields for a string. + + Parameters + ---------- + query : str + a string to search for. Defaults to requesting from user input. + + Returns + ------- + result : pandas.DataFrame + search results. + """ + if query == 'request_input': query = input('Search query').strip() diff --git a/art/classes/funders.py b/art/classes/funders.py index 8073574..b395f18 100644 --- a/art/classes/funders.py +++ b/art/classes/funders.py @@ -15,6 +15,20 @@ def generate_funder_id(funder_data: pd.Series): + """ + Takes a Pandas Series containing funder details and returns a unique identifier code. + + Parameters + ---------- + funder_data : pandas.Series + a series containing funder data + + Returns + ------- + funder_id : str + a funder ID. + """ + funder_data = funder_data.copy(deep=True).dropna().astype(str).str.lower() funder_id = 'F:' @@ -71,14 +85,41 @@ def generate_funder_id(funder_data: pd.Series): class Funder(Entity): """ - This is a Funder object. It is designed to store data about an individual funder and their publications. + This is a Funder object. It is designed to store data about an individual Funder and their publications. Parameters ---------- - + funder_id : str + a unique identifier assigned to the funder. Defaults to None. + name : str + the funder's name. Defaults to None. + alt_names : list + a list of alternate names. Defaults to an empty list. + location : str + an address associted with the funder. Defaults to None. + email : str + the funder's email address. Defaults to None. + uri : str + a DOI or other unique identifier assigned to the funder. Defaults to None. + crossref_id : str + a CrossRef identifier assigned to the funder. Defaults to None. + work_count : int + the number of publications associated with the fudner. Defaults to None. + tokens : list + a list of strings associated with the funder. Defaults to None. + website : str + the funder's website. Defaults to None + other_links : str or list + any other links associated with the funder. Defaults to None. + use_api : bool + whether to update funder data using APIs (e.g. CrossRef). Defaults to None. Attributes ---------- + summary : pandas.DataFrame + a dataframe summarising the Funder's data. + publications : Results + a Results dataframe containing data on the Funder's publications. """ def __init__(self, @@ -97,10 +138,34 @@ def __init__(self, ): """ - Initialises funder instance. + Initialises Funder instance. Parameters ---------- + funder_id : str + a unique identifier assigned to the funder. Defaults to None. + name : str + the funder's name. 
Defaults to None. + alt_names : list + a list of alternate names. Defaults to an empty list. + location : str + an address associted with the funder. Defaults to None. + email : str + the funder's email address. Defaults to None. + uri : str + a DOI or other unique identifier assigned to the funder. Defaults to None. + crossref_id : str + a CrossRef identifier assigned to the funder. Defaults to None. + work_count : int + the number of publications associated with the funder. Defaults to None. + tokens : list + a list of strings associated with the funder. Defaults to None. + website : str + the funder's website. Defaults to None + other_links : str or list + any other links associated with the funder. Defaults to None. + use_api : bool + whether to update funder data using APIs (e.g. CrossRef). Defaults to None. """ super().__init__() @@ -167,6 +232,15 @@ def __init__(self, def generate_id(self): + """ + Returns a unique identifier (funder ID) based on the Funder's data. + + Returns + ------- + funder_id : str + a funder ID. + """ + funder_data = self.summary.loc[0] funder_id = generate_funder_id(funder_data) # type: ignore @@ -174,6 +248,10 @@ def generate_id(self): def update_id(self): + """ + Replaces the Funder's existing unique identifier with a newly generated unique identifier based on the Funder's data. + """ + current_id = self.summary.loc[0, 'funder_id'] if (current_id == None) or (current_id == 'None') or (current_id == '') or (current_id == 'F:000'): @@ -184,7 +262,17 @@ def update_id(self): def __getitem__(self, key): """ - Retrieves funder attribute using a key. + Retrieves a Funder attribute or datapoint using a key. The key may be an attribute name, dataframe index position, or dataframe column name. + + Parameters + ---------- + key : object + an attribute name, dataframe index position, or dataframe column name. + + Returns + ------- + value : object + an object associated with the inputted key. """ if key in self.__dict__.keys(): @@ -198,10 +286,18 @@ def __getitem__(self, key): def __repr__(self) -> str: + """ + Defines how Funder objects are represented in string form. + """ + return str(self.summary.loc[0, 'name']) def has_uri(self) -> bool: + """ + Returns True if the Funder has a URI associated. Else, returns False. + """ + uri = self.summary.loc[0, 'uri'] if (type(uri) == str) and (uri != ''): @@ -211,6 +307,10 @@ def has_uri(self) -> bool: def add_dict(self, data: dict): + """ + Adds a dictionary of funder data to the Funder's summary dataframe. + """ + if 'name' in data.keys(): name = data['name'] self.summary.loc[0, 'name'] = name @@ -221,6 +321,22 @@ def add_dict(self, data: dict): def from_dict(data: dict, use_api=False): # type: ignore + """ + Takes a dictionary of funder data and returns a Funder object. + + Parameters + ---------- + data : dict + a dictionary of funder data. The dictionary must contain 'name' and/or 'doi' as keys. + use_api : bool + whether to update funder data using APIs (e.g. CrossRef). Defaults to None. + + Returns + ------- + funder : Funder + a Funder object. + """ + funder = Funder() funder.add_dict(data=data) @@ -230,19 +346,64 @@ def from_dict(data: dict, use_api=False): # type: ignore return funder def add_series(self, series: pd.Series): + + """ + Adds a Pandas Series object to the Funder's summary dataframe. + """ + self.summary.loc[0] = series def from_series(data: pd.Series): # type: ignore + + """ + Takes a Pandas Series and returns a Funder object. 
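A hedged sketch (editorial) of constructing a Funder directly from the keyword arguments listed in the __init__ docstring above; the values are illustrative and use_api=False avoids any CrossRef lookup.

from art.classes import Funder

funder = Funder(name='Example Research Council',   # hypothetical funder
                location='London, UK',
                website='https://example.org',
                use_api=False)

# update_id() replaces the default placeholder identifier with one generated
# from the funder's data via generate_funder_id().
funder.update_id()
print(funder['funder_id'], funder['name'])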
+ + Parameters + ---------- + data : pandas.Series + a Pandas Series with indices that match the names of columns in the Funder summary dataframe. + + Returns + ------- + funder : Funder + a Funder object. + """ + funder = Funder() funder.add_series(data) return funder def add_dataframe(self, dataframe: pd.DataFrame): + + """ + Adds data from a Pandas DataFrame to the Funder object. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame with columns that match the names of columns in the Funder's summary dataframe. + """ + series = dataframe.loc[0] self.add_series(series) # type: ignore def from_dataframe(data: pd.DataFrame): # type: ignore + + """ + Takes a Pandas DataFrame and returns a Funder object. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame with columns that match the names of columns in the Funder object's summary dataframe. + + Returns + ------- + funder : Funder + a Funder object. + """ + funder = Funder() funder.add_dataframe(data) @@ -250,6 +411,15 @@ def from_dataframe(data: pd.DataFrame): # type: ignore def import_crossref_result(self, crossref_result: pd.Series): + """ + Reads a CrossRef API result formatted as a pandas.Series and adds its data to the Funder object. + + Parameters + ---------- + crossref_result : pandas.Series + CrossRef API result. + """ + if 'name' in crossref_result.index: name = crossref_result['name'] else: @@ -301,6 +471,20 @@ def import_crossref_result(self, crossref_result: pd.Series): def from_crossref_result(crossref_result: pd.Series): # type: ignore + """ + Reads a CrossRef API result formatted as a pandas.Series and returns as a Funder object. + + Parameters + ---------- + crossref_result : pandas.Series. + CrossRef API result. + + Returns + ------- + funder : Funder + a Funder object. + """ + funder = Funder() funder.import_crossref_result(crossref_result=crossref_result) @@ -308,23 +492,77 @@ def from_crossref_result(crossref_result: pd.Series): # type: ignore def import_crossref(self, crossref_id: str, timeout = 60): + """ + Looks up a CrossRef funder ID and adds the result to the Funder object. + + Parameters + ---------- + crossref_id : str + CrossRef funder ID. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + """ + res = lookup_funder(crossref_id, timeout) self.import_crossref_result(res.loc[0]) # type: ignore - def from_crossref(crossref_id: str): # type: ignore + def from_crossref(crossref_id: str, timeout = 60): # type: ignore + + """ + Looks up a CrossRef funder ID and returns the result as a Funder object. + + Parameters + ---------- + crossref_result : pandas.Series. + CrossRef API result. + timeout : int + maximum time to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + Returns + ------- + funder : Funder + a Funder object. + """ + funder = Funder() - funder.import_crossref(crossref_id) # type: ignore + funder.import_crossref(crossref_id, timeout) # type: ignore return funder def import_uri(self, uri: str, timeout = 60): + """ + Looks up a funder URI using the CrossRef API and adds the result to the Funder object. + + Parameters + ---------- + uri : str + funder URI. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. 
+ """ + res = lookup_funder(uri, timeout) self.import_crossref_result(res.loc[0]) # type: ignore def from_uri(uri: str): # type: ignore + """ + Looks up a funder URI using the CrossRef API and returns the result as a Funder object. + + Parameters + ---------- + uri : str + funder URI. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + funder : Funder + a Funder object. + """ + funder = Funder() funder.import_uri(uri) # type: ignore @@ -332,6 +570,15 @@ def from_uri(uri: str): # type: ignore def update_from_crossref(self, timeout = 60): + """ + Looks up the Funder's CrossRef funder ID. If one is found, uses to update the Funder object. + + Parameters + ---------- + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + """ + uid = self.summary.loc[0,'crossref_id'] if uid == None: uid = self.summary.loc[0,'uri'] @@ -344,6 +591,15 @@ def update_from_crossref(self, timeout = 60): def update_from_uri(self, timeout = 60): + """ + Looks up the Funder's URI using the CrossRef API. If one is found, uses to update the Funder object. + + Parameters + ---------- + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + """ + uid = self.summary.loc[0,'uri'] if uid == None: uid = self.summary.loc[0,'crossref'] @@ -354,7 +610,8 @@ def update_from_uri(self, timeout = 60): if len(res) > 0: self.import_crossref_result(res.loc[0]) # type: ignore - def search_works(self, bibliographic: str = None, # type: ignore + def search_works(self, + bibliographic: str = None, # type: ignore title: str = None, # type: ignore author: str = None, # type: ignore author_affiliation: str = None, # type: ignore @@ -373,6 +630,56 @@ def search_works(self, bibliographic: str = None, # type: ignore timeout: int = 60, add_to_publications = False) -> pd.DataFrame: + """ + Searches for publications associated with the Funder using the CrossRef API. + + Parameters + ---------- + bibliographic : str + a combined search. Searches for titles, abstracts, author names, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for author names containing string. Defaults to None. + author_affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. Defaults to None. + source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + how long in seconds to wait for results before raising an error. 
+ add_to_publications : bool + whether to add search results to the Funder object. + filter : dict + select : list + + Returns + ------- + result : pandas.DataFrame + results from CrossRef API search. + """ + uid = self.summary.loc[0, 'crossref_id'] if (uid == None) or (uid == ''): uid = self.summary.loc[0, 'uri'] @@ -409,14 +716,21 @@ def search_works(self, bibliographic: str = None, # type: ignore class Funders(Entities): """ - This is a Funders object. It contains a collection of Funders objects and compiles data about them. + This is a Funders object. It contains a collection of Funder objects and a summary of data about them. Parameters ---------- - + funders_data : list or dict + Optional: an iterable of funders data. Data on individual funders must be formatted as dictionaries. Attributes ---------- + summary : pandas.DataFrame + a dataframe summarising the Funders collection's data. + all : dict + a dictionary storing formatted Funder objects. + data : list + a list of any unformatted data associated with Funder objects in the collection. """ def __init__(self, funders_data = None): @@ -426,6 +740,8 @@ def __init__(self, funders_data = None): Parameters ---------- + funders_data : list or dict + Optional: an iterable of funders data. Data on individual funders must be formatted as dictionaries. """ super().__init__() @@ -481,7 +797,7 @@ def __init__(self, funders_data = None): def __getitem__(self, key): """ - Retrieves funders attribute using a key. + Retrieves Funders attribute using a key. """ if key in self.__dict__.keys(): @@ -498,14 +814,46 @@ def __getitem__(self, key): def __repr__(self) -> str: + """ + Defines how Funders objects are represented in string form. + """ + alphabetical = str(self.summary['name'].sort_values().to_list()).replace('[','').replace(']','') return alphabetical def __len__(self) -> int: + + """ + Returns the number of Funder objects in the Funders collection. Counts the number of Funder objects stored in the Funders.all dictionary. + + Returns + ------- + result : int + the number of Funder objects contained in the Funders.all dictionary. + """ + return len(self.all.keys()) def merge(self, funders, drop_empty_rows = True, drop_duplicates = False): + """ + Merges the Funders collection with another Funders collection. + + Parameters + ---------- + funders : Funders + the Funders collection to merge with. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + self : Funders + the merged Funders collection. + """ + left = self.summary.copy(deep=True) right = funders.summary.copy(deep=True) @@ -553,10 +901,28 @@ def merge(self, funders, drop_empty_rows = True, drop_duplicates = False): return self - def add_funder(self, funder: Funder = None, uri: str = None, crossref_id: int = None, data = None, use_api = True, drop_empty_rows = False, drop_duplicates = False): # type: ignore - + """ + Adds a Funder or funder data to the Funders collection. + + Parameters + ---------- + funder : Funder + a Funder object to add. + uri : str + a URI identifier to look up. Defaults to None. + crossref_id : str + a CrossRef ID to look up. Defaults to None. + data : dict + Optional: a dictionary containing funder data. Dictionary keys must match the names of columns in the Funders.summary dataframe. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. 
+ drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + use_api : bool + whether to update the Funder data using the CrossRef API. Defaults to False. + """ if type(funder) == str: @@ -617,9 +983,21 @@ def add_funder(self, funder: Funder = None, uri: str = None, crossref_id: int = if drop_duplicates == True: self.remove_duplicates(drop_empty_rows=drop_empty_rows) - def add_funders_list(self, funders_list: list, drop_empty_rows = False, drop_duplicates = False): + """ + Adds a list containing Funder objects to the Funders collection. + + Parameters + ---------- + funders_list : list[Funder] + a list of Funder objects. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + for i in funders_list: if type(i) == Funder: self.add_funder(funder = i) @@ -632,6 +1010,15 @@ def add_funders_list(self, funders_list: list, drop_empty_rows = False, drop_dup def drop_empty_rows(self): + """ + Drops rows that contain no data from Funders.summary dataframe. + + Returns + ------- + self : Funders + a Funders object. + """ + ignore_cols = ['funder_id', 'alt_names', 'publications', 'tokens', 'other_links'] df = self.summary.copy(deep=True) @@ -646,6 +1033,22 @@ def drop_empty_rows(self): def remove_duplicates(self, drop_empty_rows = True, sync = False): + """ + Removes duplicate Funder entries from the Funders collection. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to True. + sync : bool + whether to synchronise the Funders.summary dataframe with the Funders.all dictionary. Defaults to False. + + Returns + ------- + self : Funders + a Funders object. + """ + if drop_empty_rows == True: self.drop_empty_rows() @@ -656,12 +1059,23 @@ def remove_duplicates(self, drop_empty_rows = True, sync = False): self.summary = deduplicate(self.summary) if sync == True: - self.sync_details(drop_duplicates=False, drop_empty_rows=False) + self.sync_summary(drop_duplicates=False, drop_empty_rows=False) return self def sync_all(self, drop_duplicates = False, drop_empty_rows=False): + """ + Updates the Funders.summary dataframe using the Funder objects in the Funders.all dictionary. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + for i in self.all.keys(): funder = self.all[i] funder.update_id() @@ -680,7 +1094,18 @@ def sync_all(self, drop_duplicates = False, drop_empty_rows=False): if drop_duplicates == True: self.remove_duplicates(drop_empty_rows=drop_empty_rows) - def sync_details(self, drop_duplicates = False, drop_empty_rows=False): + def sync_summary(self, drop_duplicates = False, drop_empty_rows=False): + + """ + Updates all Funder objects in the Funders.all dictionary using the Funders.summary dataframe. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. 
+ """ self.update_ids(sync=False) @@ -717,9 +1142,19 @@ def sync_details(self, drop_duplicates = False, drop_empty_rows=False): if drop_duplicates == True: self.remove_duplicates(drop_empty_rows=drop_empty_rows) - def sync(self, drop_duplicates = False, drop_empty_rows=False): + """ + Synchronises the Funders.summary dataframe with the Funder objects in the Funders.all dictionary. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + if drop_empty_rows == True: self.drop_empty_rows() @@ -730,19 +1165,28 @@ def sync(self, drop_duplicates = False, drop_empty_rows=False): details_len = len(self.all) if all_len > details_len: - self.sync_details(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + self.sync_summary(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return else: if details_len > all_len: self.sync_all(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return else: - self.sync_details(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + self.sync_summary(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) self.sync_all(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return def update_ids(self, sync=False): + """ + Updates funder IDs for all rows in the Funders.summary dataframe. + + Parameters + ---------- + sync : bool + whether to synchronise the Funders.summary dataframe with the Funder objects in the Funders.all dictionary. Defaults to False. + """ + if sync == True: self.sync() @@ -772,9 +1216,19 @@ def update_ids(self, sync=False): funder.summary.loc[0, 'funder_id'] = new_id self.all[new_id] = funder - def update_from_crossref(self, drop_duplicates = False, drop_empty_rows=False): + """ + Looks up all Funders' CrossRef IDs and/or URIs using the CrossRef API. If found, uses to update the Funders collection. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + funder_ids = self.all.keys() for a in funder_ids: @@ -798,9 +1252,20 @@ def update_from_crossref(self, drop_duplicates = False, drop_empty_rows=False): self.update_ids() + def import_crossref_ids(self, crossref_ids: list, drop_duplicates = False, drop_empty_rows=False): + """ + Looks up a list of CrossRef funder IDs and/or URIs using the CrossRef API. Adds any data found to the Funders collection. - def import_crossref_ids(self, crossref_ids: list, drop_duplicates = False, drop_empty_rows=False): + Parameters + ---------- + crossref_ids : list[str] + list containing CrossRef funder IDs. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ for i in crossref_ids: @@ -815,22 +1280,64 @@ def import_crossref_ids(self, crossref_ids: list, drop_duplicates = False, drop_ self.update_ids() - def from_crossref_ids(crossref_ids: list, drop_duplicates = False, drop_empty_rows=False): # type: ignore + """ + Looks up a list of CrossRef funder IDs and/or URIs using the CrossRef API. Returns all data found as a Funders object. + + Parameters + ---------- + crossref_ids : list[str] + list containing CrossRef funder IDs. 
+ drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + funders : Funders + a Funders object. + """ + funders = Funders() funders.import_crossref_ids(crossref_ids, drop_duplicates = drop_duplicates, drop_empty_rows=drop_empty_rows) return funders - def with_crossref(self): + def has_crossref(self): + + """ + Returns all rows in Funders.summary which contain CrossRef IDs. + """ + return self.summary[~self.summary['crossref_id'].isna()] - def with_uri(self): + def has_uri(self): + + """ + Returns all rows in Funders.summary which contain URIs. + """ + return self.summary[~self.summary['uri'].isna()] def import_crossref_result(self, crossref_result: pd.DataFrame, use_api = False, drop_duplicates = False, drop_empty_rows=False): + """ + Reads a pandas.DataFrame containing CrossRef API results and adds the data to the Funders collection. + + Parameters + ---------- + crossref_result : pandas.Dataframe + CrossRef API results. + use_api : bool + whether to update the Funders data using the CrossRef API. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + for i in crossref_result.index: data = crossref_result.loc[i] @@ -845,9 +1352,29 @@ def import_crossref_result(self, crossref_result: pd.DataFrame, use_api = False, self.update_ids() - def from_crossref_result(crossref_result: pd.DataFrame, use_api = False, drop_duplicates = False, drop_empty_rows=False): # type: ignore + """ + Reads a pandas.DataFrame containing CrossRef API results and returns as a Funders object. + + Parameters + ---------- + crossref_result : pandas.Dataframe + CrossRef API results. + use_api : bool + whether to update the Funders data using the CrossRef API. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + + Returns + ------- + funders : Funders + a Funders object. + """ + + funders = Funders() funders.import_crossref_result(crossref_result, use_api=use_api, drop_duplicates = drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -877,6 +1404,58 @@ def search_works(self, timeout: int = 60, add_to_publications = False) -> pd.DataFrame: + """ + Searches for publications associated with a Funder using the CrossRef API. + + Parameters + ---------- + funder_id : str + a funder ID to look up. Defaults to None. + bibliographic : str + a combined search. Searches for titles, abstracts, author names, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for author names containing string. Defaults to None. + author_affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. 
Defaults to None. + source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + how long in seconds to wait for results before raising an error. + add_to_publications : bool + whether to add search results to the Funder object. + filter : dict + select : list + + Returns + ------- + result : pandas.DataFrame + results from CrossRef API search. + """ + if (funder_id != None) and (funder_id in self.all.keys()): funder = self.all[funder_id] result = funder.search_works( @@ -951,6 +1530,21 @@ def search_works(self, def format_funders(funder_data, use_api = False, drop_duplicates = False, drop_empty_rows=False): + """ + Formats a collection of funders data as a Funders object. + + Parameters + ---------- + funder_data : object + a collection of funders data. + use_api : bool + whether to update the Funders data using the CrossRef API. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + """ + result = Funders() funder_type = type(funder_data) diff --git a/art/classes/networks.py b/art/classes/networks.py index 5d7305b..7d13cdf 100644 --- a/art/classes/networks.py +++ b/art/classes/networks.py @@ -1,4 +1,5 @@ +from networkx import degree from ..networks.network_functions import colinks_in, colinks_out from ..exporters.network_exporters import export_network, export_network_to_kumu @@ -113,6 +114,10 @@ def __init__(self, graph = None, n = 0, edges = None, directed = False, graph_at def __repr__(self): + """ + Defines how Network objects are represented in string form. + """ + if self.is_directed() == True: dir = 'Directed' else: @@ -155,52 +160,88 @@ def __repr__(self): return output def is_weighted(self) -> bool: + + """ + Checks whether edges have a 'weight' attribute. Returns True if yes; False if no. + """ + return 'weight' in self.es.attributes() - def degrees_dataframe(self, direction = 'all'): + def degrees(self): """ - Returns the network's degree distribution as a dataframe. + Returns the network's degree distributions as a Pandas DataFrame. 
""" - degrees_dataframe = pd.DataFrame(columns = ['vertex', 'degree']) - degrees = Network.degree(self, mode = direction) + isdir = self.is_directed() + + if isdir == False: + degrees_dataframe = pd.DataFrame(columns = ['vertex', 'degree']) + total_degrees = Network.degree(self, mode = 'all') + else: + + degrees_dataframe = pd.DataFrame(columns = ['vertex', 'total_degree', 'in_degree', 'out_degree']) + total_degrees = Network.degree(self, mode = 'all') + in_degrees = Network.degree(self, mode = 'in') + out_degrees = Network.degree(self, mode = 'out') index = 0 - for item in self.vs['name']: - degrees_dataframe.loc[index] = [item, degrees[index]] + for v in self.vs: + if 'name' in v.attributes().keys(): + item = v['name'] + else: + item = v.index + + degrees_dataframe.loc[index, 'vertex'] = item + + if isdir == False: + degrees_dataframe.loc[index, 'degree'] = total_degrees[index] + degrees_dataframe = degrees_dataframe.sort_values('degree', ascending=False) + else: + degrees_dataframe.loc[index, 'total_degree'] = total_degrees[index] + degrees_dataframe.loc[index, 'in_degree'] = in_degrees[index] + degrees_dataframe.loc[index, 'out_degree'] = out_degrees[index] + degrees_dataframe = degrees_dataframe.sort_values('total_degree', ascending=False) index += 1 degrees_dataframe.index.name = 'vertex_id' - degrees_dataframe = degrees_dataframe.sort_values('degree', ascending=False) return degrees_dataframe - def degree_stats(self, direction = 'all'): + def degrees_stats(self): """ - Returns frequency statistics for the network's degree distribution. + Returns frequency statistics for the network's degree distributions in a Pandas DataFrame. """ - df = self.degrees_dataframe(direction = direction) + deg_df = self.degrees() - if df is not None: - return df['degree'].describe() + if deg_df is not None: + cols = deg_df.columns.to_list() + cols.remove('vertex') + stats_df = pd.DataFrame(columns=cols) + for c in cols: + stats_df[c] = deg_df[c].describe() + return stats_df else: return None def betweenness_dataframe(self): """ - Returns the network's betweenness centrality distribution as a dataframe. + Returns the network's betweenness centrality distribution as a Pandas DataFrame. """ df = pd.DataFrame(columns = ['vertex', 'betweenness']) betweenness = Network.betweenness(self) index = 0 - for item in self.vs['name']: + for v in self.vs: + if 'name' in v.attributes().keys(): + item = v['name'] + else: + item = v.index df.loc[index] = [item, betweenness[index]] index += 1 @@ -225,7 +266,7 @@ def betweenness_stats(self): def eigencentralities_dataframe(self): """ - Returns the network's eigenvector centrality distribution as a dataframe. + Returns the network's eigenvector centrality distribution as a Pandas DataFrame. """ if self.is_directed() == True: @@ -237,7 +278,11 @@ def eigencentralities_dataframe(self): eigencentrality = Network.eigenvector_centrality(self) index = 0 - for item in self.vs['name']: + for v in self.vs: + if 'name' in v.attributes().keys(): + item = v['name'] + else: + item = v.index df.loc[index] = [item, eigencentrality[index]] index += 1 @@ -263,14 +308,18 @@ def eigencentralities_stats(self): def authority_scores_dataframe(self): """ - Returns the network's authority scores distribution as a dataframe. + Returns the network's authority scores distribution as a Pandas DataFrame. 
""" df = pd.DataFrame(columns = ['vertex', 'authority_score']) authority_scores = Network.authority_score(self) index = 0 - for item in self.vs['name']: + for v in self.vs: + if 'name' in v.attributes().keys(): + item = v['name'] + else: + item = v.index df.loc[index] = [item, authority_scores[index]] index += 1 @@ -296,14 +345,18 @@ def authority_scores_stats(self): def hub_scores_dataframe(self): """ - Returns the network's hub scores distribution as a dataframe. + Returns the network's hub scores distribution as a Pandas DataFrame. """ df = pd.DataFrame(columns = ['vertex', 'hub_score']) hub_scores = Network.hub_score(self) index = 0 - for item in self.vs['name']: + for v in self.vs: + if 'name' in v.attributes().keys(): + item = v['name'] + else: + item = v.index df.loc[index] = [item, hub_scores[index]] index += 1 @@ -329,14 +382,18 @@ def hub_scores_stats(self): def coreness_dataframe(self): """ - Returns the network's coreness scores distribution as a dataframe. + Returns the network's coreness scores distribution as a Pandas DataFrame. """ df = pd.DataFrame(columns = ['vertex', 'coreness']) coreness = Network.coreness(self) index = 0 - for item in self.vs['name']: + for v in self.vs: + if 'name' in v.attributes().keys(): + item = v['name'] + else: + item = v.index df.loc[index] = [item, coreness[index]] index += 1 @@ -362,7 +419,18 @@ def coreness_stats(self): def community_detection(self, algorithm='fastgreedy'): - """Identifies communities in the network. Gives the option of using different algorithms.""" + """Identifies communities in the network. Gives the option of using different algorithms. + + Parameters + ---------- + algorithm : str + name of community detection algorithm. Options: + 1. betweenness + 2. fastgreedy + 3. eigenvector + 4. spinglass + 5. walktrap + """ if (algorithm == None) or (algorithm == ''): algorithm = input('Algorithm must be specified. Options: 1. betweenness, 2. fastgreedy, 3. eigenvector, 4. spinglass, 5. 
walktrap.:')
 @@ -447,89 +515,127 @@ def weighted_density(self):
         return weighted_density
 
-    def weighted_degrees_dataframe(self, direction = 'all'):
+    def weighted_degrees(self):
 
-        """Returns the network's weighted degrees distribution as a dataframe."""
+        """Returns the network's weighted degree distributions as a Pandas DataFrame."""
 
+        # Checks if network is directed
+        isdir = self.is_directed()
 
         # Checks if network is weighted
         if 'weight' not in self.es.attributes():
-            degrees_dataframe = self.degrees_dataframe(direction = direction)
-            degrees_dataframe['weighted_degree'] = degrees_dataframe['degree']
-            degrees_dataframe = degrees_dataframe.drop('degree', axis = 1)
-            return degrees_dataframe
-
+            degrees_dataframe = self.degrees()
+
+            if isdir == False:
+                degrees_dataframe = degrees_dataframe.rename(columns={'degree':'weighted_degree'})
+            else:
+                degrees_dataframe = degrees_dataframe.rename(columns={'total_degree':'weighted_total_degree', 'in_degree': 'weighted_in_degree', 'out_degree': 'weighted_out_degree'})
+
+            return degrees_dataframe
+
         else:
+            if isdir == False:
+                cols = ['vertex', 'weighted_degree']
+                directions = {'all': 'weighted_degree'}
+            else:
+                cols = ['vertex', 'weighted_total_degree', 'weighted_in_degree', 'weighted_out_degree']
+                directions = {'all': 'weighted_total_degree', 'in': 'weighted_in_degree', 'out': 'weighted_out_degree'}
 
-            degrees_dataframe = pd.DataFrame(columns = ['vertex', 'weighted_degree'])
+            degrees_dataframe = pd.DataFrame(columns = cols)
 
             index = 0
             for vertex in self.vs:
 
-                weighted_degree = 0
-                incident_edges = (Network.incident(self, vertex))
-                for edge in incident_edges:
-                    weight = self.es[edge]['weight']
-                    weighted_degree += weight
-                degrees_dataframe.loc[index] = [vertex['name'], weighted_degree]
+                if 'name' in vertex.attributes().keys():
+                    degrees_dataframe.loc[index, 'vertex'] = vertex['name']
+                else:
+                    degrees_dataframe.loc[index, 'vertex'] = vertex.index
+                for d in directions.keys():
+                    colname = directions[d]
+                    # Reset for each direction so in- and out-degrees are not stacked on top of the total
+                    weighted_degree = 0
+                    incident_edges = (Network.incident(self, vertex, mode=d))
+                    for edge in incident_edges:
+                        weight = self.es[edge]['weight']
+                        weighted_degree += weight
+                    degrees_dataframe.loc[index, colname] = weighted_degree
 
                 index += 1
 
-        degrees_dataframe = degrees_dataframe.sort_values('weighted_degree', ascending=False)
+            if isdir == False:
+                degrees_dataframe = degrees_dataframe.sort_values('weighted_degree', ascending=False)
+            else:
+                degrees_dataframe = degrees_dataframe.sort_values('weighted_total_degree', ascending=False)
+
+            degrees_dataframe.index.name = 'vertex_id'
 
             return degrees_dataframe
 
-    def weighted_degrees_stats(self, direction = 'all'):
+    def weighted_degrees_stats(self):
 
         """
-        Returns frequency statistics for the weighted degree distribution.
+        Returns frequency statistics for the weighted degree distributions as a Pandas DataFrame.
         """
-
-        df = self.weighted_degrees_dataframe(direction = direction)
+
+        df = self.weighted_degrees()
 
         if df is not None:
-            return df['weighted_degree'].describe()
+            cols = df.columns.to_list()
+            cols.remove('vertex')
+            stats_df = pd.DataFrame(columns=cols)
+            for c in cols:
+                stats_df[c] = df[c].describe()
+            return stats_df
+
         else:
+            return None
 
-    def degrees_dist(self, weighted = False, direction = 'all'):
+    def degree_distributions(self, weighted = False, direction = 'all'):
 
         """
-        Returns either the weighted or unweighted degrees distribution as a dataframe.
+        Returns either the weighted or unweighted degree distributions as a Pandas DataFrame.
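+
+        Examples
+        --------
+        Illustrative sketch only (``net`` is an assumed, pre-built Network instance):
+
+        >>> net.degree_distributions()                 # unweighted degree frequencies
+        >>> net.degree_distributions(weighted=True)    # weighted degree frequencies
+        >>> net.degree_distributions(direction='in')   # only meaningful for directed networks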
""" - + + isdir = self.is_directed() + directions = {'all': 'weighted_total_degree', 'in': 'weighted_in_degree', 'out': 'weighted_out_degree'} + if weighted == True: - degrees_frame = self.weighted_degrees_dataframe(direction = direction) - freq_table = degrees_frame['weighted_degree'].value_counts() - dist_frame = pd.DataFrame({'weighted_degree':freq_table.index, 'counts':freq_table.values}) + degrees_frame = self.weighted_degrees() + else: + degrees_frame = self.degrees() - return dist_frame + cols = degrees_frame.columns.to_list() + cols.remove('vertex') + if isdir == False: + col = cols[0] else: - degrees_frame = self.degrees_dataframe(direction = direction) - freq_table = degrees_frame['degree'].value_counts() - dist_frame = pd.DataFrame({'degree':freq_table.index, 'counts':freq_table.values}) + col = directions[direction] + + freq_table = degrees_frame[col].value_counts() - return dist_frame + + dist_frame = pd.DataFrame({col:freq_table.index, 'frequency':freq_table.values}).sort_values(col, ascending=False) + return dist_frame - def all_centralities(self, sort_by = ['weighted_degree','degree', 'betweenness','eigencentrality','authority_score','hub_score']): + def all_centralities(self): """ - Calculates all centrality measures for network. Returns as a dataframe. + Calculates all centrality measures for network. Returns as a Pandas Dataframe. """ is_directed = self.is_directed() try: - degrees = self.degrees_dataframe().set_index('vertex').sort_index() + degrees = self.degrees().set_index('vertex').sort_index() except: degrees = pd.DataFrame() degrees.index.name = 'vertex' try: - weighted_degrees = self.weighted_degrees_dataframe().set_index('vertex').sort_index() + weighted_degrees = self.weighted_degrees().set_index('vertex').sort_index() except: weighted_degrees = pd.DataFrame() weighted_degrees.index.name = 'vertex' @@ -570,22 +676,30 @@ def all_centralities(self, sort_by = ['weighted_degree','degree', 'betweenness', hubs) if is_directed == True: - sort_by = ['weighted_degree','degree', 'betweenness','authority_score','hub_score'] - + sort_by = ['weighted_total_degree', 'total_degree', 'betweenness'] + else: + sort_by = ['weighted_degree', 'degree', 'betweenness', 'eigencentrality', 'authority_score','hub_score'] + + if combined.index.dtype == 'float64': + combined.index = combined.index.astype(int) + return combined.sort_values(sort_by, ascending=False) def get_neighbours(self, vertex_name = 'request_input'): - """Returns vertex neighbours as a dataframe.""" + """Returns vertex neighbours as a Pandas Dataframe.""" if vertex_name == 'request_input': - vertex_name = input('Vertex name: ') + vertex_name = input('Vertex name or index: ') # Get vertex - vertex = self.vs.find(name = vertex_name) + if 'name' in self.vs.attributes(): + vertex = self.vs.find(name = vertex_name) + else: + vertex = self.vs[vertex_name] - # Get vertex neighbours in a dataframe + # Get vertex neighbours in a Pandas DataFrame df = pd.DataFrame(columns = ['vertex_id', 'vertex_name']) @@ -610,8 +724,14 @@ def get_degree(self, vertex_name = 'request_input', direction = 'all'): if vertex_name == 'request_input': vertex_name = input('Vertex name: ') + + # Get vertex + if 'name' in self.vs.attributes(): + vertex = self.vs.find(name = vertex_name) + else: + vertex = self.vs[vertex_name] - degree = len(self.vs.find(name = vertex_name).neighbors()) + degree = len(vertex.neighbors()) degree = int(degree) return degree @@ -624,7 +744,7 @@ def get_weighted_degree(self, vertex_name = 'request_input', direction = 'all'): 
        if vertex_name == 'request_input':
             vertex_name = input('Vertex name: ')
 
-        df = self.weighted_degrees_dataframe(direction = direction)
+        # weighted_degrees() no longer takes a direction argument; it returns all directions at once
+        df = self.weighted_degrees()
 
         masked = df[df['vertex'] == vertex_name]
         degree = int(masked['weighted_degree']) # type: ignore
 
 @@ -649,7 +769,7 @@ def get_betweenness(self, vertex_name = 'request_input', direction = 'all'):
 
     def colinks(self, direction = 'out'):
 
         """
-        Runs a colink analysis on the network. Returns a dataframe.
+        Runs a colink analysis on the network. Returns a Pandas DataFrame.
 
         Parameters
         ----------
 @@ -715,6 +835,8 @@ def visualise(self, vertex_names = True, edge_weights = False, weight_by = 'weig
 
     def to_igraph(self) -> Graph:
 
+        """Returns the Network as an igraph Graph object."""
+
         is_dir = self.is_directed()
         g_attrs = self.attributes()
 
 @@ -1028,7 +1150,7 @@ def reciprocity(self, network = 'request_input', ignore_loops=True, mode='defaul
 
     def degrees_df(self, network = 'request_input', direction = 'all'):
 
         """
-        Calculates the degree distribution of the network. Returns a dataframe.
+        Calculates the degree distribution of the network. Returns a Pandas DataFrame.
 
         Parameters
         ----------
 @@ -1040,7 +1162,7 @@ def degrees_df(self, network = 'request_input', direction = 'all'):
 
         Returns
         -------
         result : pandas.DataFrame
-            a dataframe containing the degree distribution of the graph.
+            a Pandas DataFrame containing the degree distribution of the graph.
         """
 
         if network == 'request_input':
 @@ -1076,7 +1198,7 @@ def degrees_stats(self, network = 'request_input', direction = 'all'):
 
         Returns
         -------
         result : pandas.DataFrame
-            a dataframe of frequency statistics for the degree distribution of the
+            a Pandas DataFrame of frequency statistics for the degree distribution of the
             graph.
         """
 
 @@ -1093,7 +1215,7 @@ def degrees_stats(self, network = 'request_input', direction = 'all'):
 
     def betweenness_df(self, network = 'request_input', vertices=None, directed=True, cutoff=None, weights=None, sources=None, targets=None):
 
         """
-        Calculates or estimates the betweenness of vertices in a network. Returns a dataframe.
+        Calculates or estimates the betweenness of vertices in a network. Returns a Pandas DataFrame.
 
         Also supports calculating betweenness with shortest path length cutoffs or
         considering shortest paths only from certain source vertices or to certain
 @@ -1126,7 +1248,7 @@ def betweenness_df(self, network = 'request_input', vertices=None, directed=True
 
         Returns
         -------
         result : pandas.DataFrame
-            the (possibly cutoff-limited) betweenness of the given vertices in a dataframe.
+            the (possibly cutoff-limited) betweenness of the given vertices in a Pandas DataFrame.
         """
 
         if network == 'request_input':
 @@ -1151,7 +1273,7 @@ def betweenness_df(self, network = 'request_input', vertices=None, directed=True
 
     def betweenness_stats(self, network = 'request_input', vertices=None, directed=True, cutoff=None, weights=None, sources=None, targets=None):
 
         """
-        Returns frequency statistics for the betweenness of vertices in a network. Returns a dataframe.
+        Returns frequency statistics for the betweenness of vertices in a network. Returns a Pandas DataFrame.
 
         Parameters
         ----------
 @@ -1180,7 +1302,7 @@ def betweenness_stats(self, network = 'request_input', vertices=None, directed=T
 
         Returns
         -------
         result : pandas.DataFrame
-            frequency statistics for betweenness of the given vertices in a dataframe.
+            frequency statistics for betweenness of the given vertices in a Pandas DataFrame.
""" if network == 'request_input': @@ -1197,7 +1319,7 @@ def betweenness_stats(self, network = 'request_input', vertices=None, directed=T def eigencentralities_df(self, network = 'request_input', scale=True, weights=None, return_eigenvalue=False): """ - Calculates the eigenvector centralities of the vertices in a graph. Returns a dataframe. + Calculates the eigenvector centralities of the vertices in a graph. Returns a Pandas DataFrame. Eigenvector centrality is a measure of the importance of a node in a network. It assigns relative scores to all nodes in the network based @@ -1241,7 +1363,7 @@ def eigencentralities_df(self, network = 'request_input', scale=True, weights=No Returns ------- result : pandas.DataFrame - the eigenvector centralities in a dataframe. + the eigenvector centralities in a Pandas DataFrame. """ if network == 'request_input': @@ -1315,7 +1437,7 @@ def eigencentralities_stats(self, network = 'request_input'): Returns ------- result : pandas.Series - frequency statistics for eigenvector centralities in a dataframe. + frequency statistics for eigenvector centralities in a Pandas DataFrame. """ if network == 'request_input': @@ -1331,7 +1453,7 @@ def eigencentralities_stats(self, network = 'request_input'): def authority_scores_df(self, network = 'request_input', weights=None, scale=True, return_eigenvalue=False): """ - Calculates Kleinberg's authority score for the vertices of the network. Returns a dataframe. + Calculates Kleinberg's authority score for the vertices of the network. Returns a Pandas DataFrame. Parameters ---------- @@ -1352,7 +1474,7 @@ def authority_scores_df(self, network = 'request_input', weights=None, scale=Tru Returns ------- result : pandas.DataFrame - the authority scores as a dataframe. + the authority scores as a Pandas DataFrame. """ if network == 'request_input': @@ -1377,7 +1499,7 @@ def authority_scores_df(self, network = 'request_input', weights=None, scale=Tru def authority_scores_stats(self, network = 'request_input', weights=None, scale=True, return_eigenvalue=False): """ - Returns frequency statistics for Kleinberg's authority score for the vertices of the network. Returns a dataframe. + Returns frequency statistics for Kleinberg's authority score for the vertices of the network. Returns a Pandas DataFrame. Parameters ---------- @@ -1398,7 +1520,7 @@ def authority_scores_stats(self, network = 'request_input', weights=None, scale= Returns ------- result : pandas.DataFrame - frequency statistics for authority scores as a dataframe. + frequency statistics for authority scores as a Pandas DataFrame. """ if network == 'request_input': @@ -1414,7 +1536,7 @@ def authority_scores_stats(self, network = 'request_input', weights=None, scale= def hub_scores_df(self, network = 'request_input', weights=None, scale=True, return_eigenvalue=False): """ - Calculates Kleinberg's hub score for the vertices of the graph. Returns a dataframe. + Calculates Kleinberg's hub score for the vertices of the graph. Returns a Pandas DataFrame. Parameters ---------- @@ -1435,7 +1557,7 @@ def hub_scores_df(self, network = 'request_input', weights=None, scale=True, ret Returns ------- result : pandas.DataFrame - the hub scores as a dataframe. + the hub scores as a Pandas DataFrame. 
""" if network == 'request_input': @@ -1460,7 +1582,7 @@ def hub_scores_df(self, network = 'request_input', weights=None, scale=True, ret def hub_scores_stats(self, network = 'request_input', weights=None, scale=True, return_eigenvalue=False): """ - Returns frequency statistisc for Kleinberg's hub score for the vertices of the graph. Returns a dataframe. + Returns frequency statistisc for Kleinberg's hub score for the vertices of the graph. Returns a Pandas DataFrame. Parameters ---------- @@ -1482,7 +1604,7 @@ def hub_scores_stats(self, network = 'request_input', weights=None, scale=True, Returns ------- result : pandas.DataFrame - frequency statistics for hub scores as a dataframe. + frequency statistics for hub scores as a Pandas DataFrame. """ if network == 'request_input': @@ -1498,7 +1620,7 @@ def hub_scores_stats(self, network = 'request_input', weights=None, scale=True, def coreness_df(self, network = 'request_input', mode='all'): """ - Finds the coreness (shell index) of the vertices of the network. Returns a dataframe. + Finds the coreness (shell index) of the vertices of the network. Returns a Pandas DataFrame. The M{k}-core of a graph is a maximal subgraph in which each vertex has at least degree k. (Degree here means the degree in the @@ -1686,7 +1808,7 @@ def decompose(self, network = 'request_input', mode='strong', maxcompno=None, mi def weighted_degrees_df(self, network = 'request_input', direction = 'all'): - """Calculates a network's weighted degrees and returns a dataframe.""" + """Calculates a network's weighted degrees and returns a Pandas DataFrame.""" if network == 'request_input': network = input('Network name: ') @@ -1739,7 +1861,7 @@ def weighted_degrees_stats(self, network = 'request_input', direction = 'all'): def degree_distribution(self, network = 'request_input', weighted = False, direction = 'all'): """ - Returns the network's weighted or unweighted degree distribution as a dataframe. + Returns the network's weighted or unweighted degree distribution as a Pandas DataFrame. """ if network == 'request_input': @@ -1761,74 +1883,22 @@ def degree_distribution(self, network = 'request_input', weighted = False, direc - def all_centralities(self, network = 'request_input', sort_by = ['weighted_degree','degree', 'betweenness','eigencentrality','authority_score','hub_score']): + def all_centralities(self, network = 'request_input'): """ - Calculates all centrality measures for network. Returns as a dataframe. + Calculates all centrality measures for network. Returns as a Pandas DataFrame. 
""" if network == 'request_input': network = input('Network name: ') network_obj = self.get_network(network) - is_directed = network_obj.is_directed() - - try: - degrees = self.degrees_df(network = network).set_index('vertex').sort_index() - except: - degrees = pd.DataFrame() - degrees.index.name = 'vertex' - - try: - weighted_degrees = self.weighted_degrees_df(network = network).set_index('vertex').sort_index() - except: - weighted_degrees = pd.DataFrame() - weighted_degrees.index.name = 'vertex' - - try: - eigencents = self.eigencentralities_df(network = network).set_index('vertex').sort_index() # type: ignore - except: - eigencents = pd.DataFrame() - eigencents.index.name = 'vertex' - - try: - betweenness = self.betweenness_df(network = network).set_index('vertex').sort_index() - except: - betweenness = pd.DataFrame() - betweenness.index.name = 'vertex' - - try: - auths = self.authority_scores_df(network = network).set_index('vertex').sort_index() - except: - auths = pd.DataFrame() - auths.index.name = 'vertex' - - try: - hubs = self.hub_scores_df(network = network).set_index('vertex').sort_index() - except: - hubs = pd.DataFrame() - hubs.index.name = 'vertex' - - combined = weighted_degrees.join( - degrees - ).join( - betweenness - ).join( - eigencents - ).join( - auths - ).join( - hubs) - - if is_directed == True: - sort_by = ['weighted_degree','degree', 'betweenness','authority_score','hub_score'] - - return combined.sort_values(sort_by, ascending=False) + return network_obj.all_centralities() def get_neighbours(self, network = 'request_input', vertex_name = 'request_input'): - """Returns vertex neighbours as a dataframe""" + """Returns vertex neighbours as a Pandas DataFrame""" if network == 'request_input': network = input('Network name: ') @@ -1841,7 +1911,7 @@ def get_neighbours(self, network = 'request_input', vertex_name = 'request_inpu # Get vertex vertex = network_obj.vs.find(name = vertex_name) - # Get vertex neighbours in a dataframe + # Get vertex neighbours in a Pandas DataFrame df = pd.DataFrame(columns = ['vertex_id', 'vertex_name']) @@ -1895,7 +1965,7 @@ def get_weighted_degree(self, network = 'request_input', vertex_name = 'request_ def get_item_all_degrees(self, item_id = 'request_input', weighted = False): """ - Returns a dataframe of degrees for all vertices representing an item. Takes an item ID. + Returns a Pandas DataFrame of degrees for all vertices representing an item. Takes an item ID. """ if item_id == 'request_input': diff --git a/art/classes/references.py b/art/classes/references.py index 2d3a355..76520ff 100644 --- a/art/classes/references.py +++ b/art/classes/references.py @@ -9,14 +9,96 @@ class References(Results): """ - This is a References object. It is a modified Pandas Dataframe object designed to store References relating to an entry. + This is a References DataFrame. It is a modified Pandas Dataframe object designed to store citations, references, and links associated with a published work. Parameters ---------- - - + dataframe : pandas.DataFrame + a Pandas DataFrame to convert to a Results DataFrame. Defaults to None. + index : list + list of indices for Results DataFrame. Defaults to an empty list. 
+ Attributes ---------- + T : pandas.DataFrame + _AXIS_LEN : int + _AXIS_ORDERS : list + _AXIS_TO_AXIS_NUMBER : dict + _HANDLED_TYPES : tuple + __annotations__ : dict + __array_priority__ : int + _attrs : dict + _constructor : type + _constructor_sliced : type + _hidden_attrs : frozenset + _info_axis : pandas.Index + _info_axis_name : str + _info_axis_number : int + _internal_names : set + _is_homogeneous_type : bool + _is_mixed_type : bool + _is_view : bool + _item_cache : dict + _metadata : list + _series : dict + _stat_axis : pandas.Index + _stat_axis_name : str + _stat_axis_number : int + _typ : str + _values : numpy.ndarray + attrs : dict + axes : list + columns : pandas.Index + dtypes : pandas.Series + empty : bool + flags : pandas.Flags + index : pandas.Index + ndim : int + shape : tuple + size : numpy.int64 + values : numpy.ndarray + + Columns + ------- + * **work_id**: a unique identifier assigned to each result. + * **title**: the result's title. + * **authors**: any authors associated with the result. + * **date**: any date(s) or year(s) associated with the result. + * **source**: the name of the journal, conference, book, website, or other publication in which the result is contained (if any). + * **type**: result type (e.g. article, chapter, book, website). + * **editors**: any authors associated with the result. + * **publisher**: the name of the result's publisher (if any). + * **publisher_location**: any locations or addresses associated with the result's publisher. + * **funder**: any funders associated with the result. + * **keywords**: any keywords associated with the result. + * **abstract**: the result's abstract (if available). + * **description**: the result's abstract (if available). + * **extract**: the result's extract (if available). + * **full_text**: the result's full text (if available). + * **access_type**: the result's access type (e.g. open access, restricted access) + * **authors_data**: unformatted data on any authors associated with the result. + * **author_count**: the number of authors associated with the result. + * **author_affiliations**: any affiliations associated with the result's authors. + * **editors_data**: unformatted data on any editors associated with the result. + * **citations**: any citations/references/links associated with the result. + * **citation_count**: the number of citations/references/links associated with the result. + * **citations_data**: unformatted data on any citations/references/links associated with the result. + * **cited_by**: a list of publications that cite/reference/link to the result. + * **cited_by_count**: the number of publications that cite/reference/link to the result. + * **cited_by_data**: unformatted data on publications that cite/reference/link to the result. + * **recommendations**: data on recommended publications associated with the result. + * **crossref_score**: a bibliometric score assigned by CrossRef (if available). + * **repository**: the repository from which the result was retrieved (if available). + * **language**: the language(s) of the result. + * **doi**: the Digital Object Identifier (DOI) assigned to the result. + * **isbn**: the International Standard Book Number (ISBN) assigned to the result. + * **issn**: the International Standard Serial Number (ISSN) assigned to the result or its source. + * **pii**: any Publisher Item Identifiers (PII) assigned to the result. + * **scopus_id**: the Scopus identifier assigned to the result. 
+    * **wos_id**: the Web of Science (WoS) identifier assigned to the result.
+    * **pubmed_id**: the PubMed Identifier (PMID) assigned to the result.
+    * **other_ids**: any other identifiers assigned to the result.
+    * **link**: a URL or other link to the result.
     """
 
     def __init__(self, dataframe = None, index = []):
 @@ -26,6 +108,10 @@ def __init__(self, dataframe = None, index = []):
 
         Parameters
         ----------
+        dataframe : pandas.DataFrame
+            a Pandas DataFrame to convert to a References DataFrame.
+        index : list
+            list of indices for the References DataFrame. Defaults to an empty list.
         """
 
 @@ -36,10 +122,32 @@ def __init__(self, dataframe = None, index = []):
 
     def __repr__(self):
 
+        """
+        Defines how References objects are represented in string form.
+        """
+
         return f'References object containing {len(self)} items'
 
     def from_dataframe(dataframe): # type: ignore
 
+        """
+        Converts a Pandas DataFrame to a References object.
+
+        Parameters
+        ----------
+        dataframe : pandas.DataFrame
+            a Pandas DataFrame to convert to a References object.
+
+        Returns
+        -------
+        results_table : References
+            a References object.
+        """
+
         dataframe = dataframe.copy(deep=True).reset_index().drop('index', axis=1)
         results_table = References(index = dataframe.index)
         results_table.columns = results_table.columns.astype(str).str.lower().str.replace(' ', '_')
 @@ -54,13 +162,30 @@ def from_dataframe(dataframe): # type: ignore
 
 def is_formatted_reference(item):
 
+    """
+    Returns True if the object is a References instance; else, returns False.
+    """
+
     if type(item) == References:
         return True
     else:
         return False
 
-def extract_references(references_data, add_work_ids = True, update_from_doi = False):
+def format_references(references_data, add_work_ids = True, update_from_doi = False):
+
+    """
+    Formats a collection of citations, references, and/or links as a References object.
+
+    Parameters
+    ----------
+    references_data : object
+        a collection of citations, references, and/or links.
+    add_work_ids : bool
+        whether to assign unique identifiers (work IDs). Defaults to True.
+    update_from_doi : bool
+        whether to update the References data using the CrossRef API. Defaults to False.
+    """
 
     refs = References()
 
 diff --git a/art/classes/results.py b/art/classes/results.py
 index c0dcad7..13a4812 100644
 --- a/art/classes/results.py
 +++ b/art/classes/results.py
 @@ -17,6 +17,20 @@ from pybtex.database import BibliographyData, Entry #type: ignore
 
 def generate_work_id(work_data: pd.Series):
+
+    """
+    Takes a Pandas Series containing details about a published work and returns a unique identifier code (work ID).
+
+    Parameters
+    ----------
+    work_data : pandas.Series
+        a series containing data on a published work.
+
+    Returns
+    -------
+    work_id : str
+        a work ID.
+    """
 
     work_data = work_data.copy(deep=True).dropna()
 
 @@ -108,18 +122,99 @@ def generate_work_id(work_data: pd.Series):
 
     return work_id
 
-
 class Results(pd.DataFrame):
 
     """
-    This is a Results object. It is a modified Pandas Dataframe object designed to store the results of an academic review.
+    This is a Results DataFrame. It is a modified Pandas Dataframe object designed to store the results of an academic review.
 
     Parameters
     ----------
-
-
+    dataframe : pandas.DataFrame
+        a Pandas DataFrame to convert to a Results DataFrame. Defaults to None.
+    index : list
+        list of indices for Results DataFrame.
Defaults to an empty list. + Attributes ---------- + T : pandas.DataFrame + _AXIS_LEN : int + _AXIS_ORDERS : list + _AXIS_TO_AXIS_NUMBER : dict + _HANDLED_TYPES : tuple + __annotations__ : dict + __array_priority__ : int + _attrs : dict + _constructor : type + _constructor_sliced : type + _hidden_attrs : frozenset + _info_axis : pandas.Index + _info_axis_name : str + _info_axis_number : int + _internal_names : set + _is_homogeneous_type : bool + _is_mixed_type : bool + _is_view : bool + _item_cache : dict + _metadata : list + _series : dict + _stat_axis : pandas.Index + _stat_axis_name : str + _stat_axis_number : int + _typ : str + _values : numpy.ndarray + attrs : dict + axes : list + columns : pandas.Index + dtypes : pandas.Series + empty : bool + flags : pandas.Flags + index : pandas.Index + ndim : int + shape : tuple + size : numpy.int64 + values : numpy.ndarray + + Columns + ------- + * **work_id**: a unique identifier assigned to each result. + * **title**: the result's title. + * **authors**: any authors associated with the result. + * **date**: any date(s) or year(s) associated with the result. + * **source**: the name of the journal, conference, book, website, or other publication in which the result is contained (if any). + * **type**: result type (e.g. article, chapter, book, website). + * **editors**: any authors associated with the result. + * **publisher**: the name of the result's publisher (if any). + * **publisher_location**: any locations or addresses associated with the result's publisher. + * **funder**: any funders associated with the result. + * **keywords**: any keywords associated with the result. + * **abstract**: the result's abstract (if available). + * **description**: the result's abstract (if available). + * **extract**: the result's extract (if available). + * **full_text**: the result's full text (if available). + * **access_type**: the result's access type (e.g. open access, restricted access) + * **authors_data**: unformatted data on any authors associated with the result. + * **author_count**: the number of authors associated with the result. + * **author_affiliations**: any affiliations associated with the result's authors. + * **editors_data**: unformatted data on any editors associated with the result. + * **citations**: any citations/references/links associated with the result. + * **citation_count**: the number of citations/references/links associated with the result. + * **citations_data**: unformatted data on any citations/references/links associated with the result. + * **cited_by**: a list of publications that cite/reference/link to the result. + * **cited_by_count**: the number of publications that cite/reference/link to the result. + * **cited_by_data**: unformatted data on publications that cite/reference/link to the result. + * **recommendations**: data on recommended publications associated with the result. + * **crossref_score**: a bibliometric score assigned by CrossRef (if available). + * **repository**: the repository from which the result was retrieved (if available). + * **language**: the language(s) of the result. + * **doi**: the Digital Object Identifier (DOI) assigned to the result. + * **isbn**: the International Standard Book Number (ISBN) assigned to the result. + * **issn**: the International Standard Serial Number (ISSN) assigned to the result or its source. + * **pii**: any Publisher Item Identifiers (PII) assigned to the result. + * **scopus_id**: the Scopus identifier assigned to the result. 
+ * **wos_id**: the Web of Science (WoS) identifier assigned to the result. + * **pubmed_id**: the PubMed Identifier (PMID) assigned to the result. + * **other_ids**: any other identifiers assigned to the result. + * **link**: a URL or other link to the result. """ def __init__(self, dataframe = None, index = []): @@ -129,6 +224,10 @@ def __init__(self, dataframe = None, index = []): Parameters ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame to convert to a Results DataFrame. Defaults to None. + index : list + list of indices for Results DataFrame. Defaults to an empty list. """ if dataframe is None: @@ -148,6 +247,11 @@ def __init__(self, dataframe = None, index = []): def drop_empty_rows(self): + """ + Drops rows that contain no data. + """ + + ignore_cols = ['work_id', 'authors', 'funder', 'citations'] df = self.dropna(axis=0, how='all') @@ -160,10 +264,23 @@ def drop_empty_rows(self): return self + def remove_duplicates(self, drop_empty_rows = True, use_api = False): + """ + Removes duplicate results. - - def remove_duplicates(self, drop_empty_rows = True, update_from_api = False): + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to True. + use_api : bool + whether to update the results data using all available APIs. Defaults to False. + + Returns + ------- + self : Results + a Results object. + """ if drop_empty_rows == True: self.drop_empty_rows() @@ -173,7 +290,7 @@ def remove_duplicates(self, drop_empty_rows = True, update_from_api = False): df = deduplicate(self) results = Results.from_dataframe(dataframe = df, drop_duplicates=False) - if update_from_api == True: + if use_api == True: results.update_from_dois() results.update_work_ids() @@ -185,10 +302,12 @@ def remove_duplicates(self, drop_empty_rows = True, update_from_api = False): return self - - def get(self, work_id: str): + """ + Retrieves result using a work ID. + """ + indexes = self[self['work_id'] == work_id].index.to_list() if len(indexes) > 0: index = indexes[0] @@ -198,6 +317,15 @@ def get(self, work_id: str): def add_pdf(self, path = 'request_input'): + """ + Reads a PDF file from a file path or URL and adds its data to the Results DataFrame. + + Parameters + ---------- + path : str + a filepath or URL that directs to a PDF file. Defaults to requesting from user input. + """ + if path == 'request_input': path = input('Path to PDF (URL or filepath): ') @@ -213,7 +341,18 @@ def add_pdf(self, path = 'request_input'): return self - def add_row(self, data, drop_duplicates=True): + def add_row(self, data: pd.Series, drop_duplicates: bool = True): + + """ + Adds a Pandas Series to the Results DataFrame as a new row. + + Parameters + ---------- + data : pandas.Series + a row to add. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to True. + """ if type(data) != pd.Series: raise TypeError(f'Results must be a Pandas.Series, not {type(data)}') @@ -238,6 +377,10 @@ def add_row(self, data, drop_duplicates=True): def get_unique_id(self, work_id, index): + """ + Checks whether work ID is used more than once in the Results DataFrame. If yes, returns a unique ID. + """ + if (type(work_id) == str) and (work_id != ''): try: work_id = str(work_id.split('#')[0]) @@ -256,6 +399,21 @@ def get_unique_id(self, work_id, index): def add_dataframe(self, dataframe: pd.DataFrame, drop_empty_rows = True, drop_duplicates = False, update_work_ids = True): + """ + Adds a Pandas DataFrame to the Results DataFrame. 
+ + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame to add to the Results DataFrame. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to True. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + update_work_ids : bool + whether to update results entries' work IDs. Defaults to True. + """ + if (type(dataframe) != pd.DataFrame) and (type(dataframe) != pd.Series): raise TypeError(f'Results must be a Pandas.Series or Pandas.DataFrame, not {type(dataframe)}') @@ -288,6 +446,23 @@ def add_dataframe(self, dataframe: pd.DataFrame, drop_empty_rows = True, drop_du self.remove_duplicates(drop_empty_rows=drop_empty_rows) def add_doi(self, doi: str = 'request_input', drop_empty_rows = True, drop_duplicates = False, timeout: int = 60): + + """ + Looks up a DOI using the CrossRef API and adds to the Results DataFrame. + + Parameters + ---------- + doi : str + DOI to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + """ + + df = lookup_doi(doi=doi, timeout=timeout) self.add_dataframe(dataframe=df) @@ -297,14 +472,37 @@ def add_doi(self, doi: str = 'request_input', drop_empty_rows = True, drop_dupli if drop_empty_rows == True: self.drop_empty_rows() + def add_dois(self, dois_list: list = [], drop_empty_rows = True, rate_limit: float = 0.05, timeout = 60): + + """ + Looks up a list of DOIs using the CrossRef API and adds to Results DataFrame. + Parameters + ---------- + dois_list : list + list of DOIs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + """ - def add_dois(self, dois_list: list = [], drop_empty_rows = True, rate_limit: float = 0.1, timeout = 60): df = lookup_dois(dois_list=dois_list, rate_limit=rate_limit, timeout=timeout) - self.add_dataframe(dataframe=df, drop_empty_rows = True) + self.add_dataframe(dataframe=df, drop_empty_rows = drop_empty_rows) def correct_dois(self, drop_duplicates = False): + """ + Checks all entries in Results DataFrame for correctly formatted DOIs. If none is found, checks whether URL contains a valid DOI and, if so, uses this. Additionally, strips existing DOIs of unnecessary strings. + + Parameters + ---------- + drop_duplicates : bool + whether to remove duplicated rows. + """ + no_doi = self[self['doi'].isna()] has_link = no_doi[~no_doi['link'].isna()] doi_in_link = has_link[has_link['link'].str.contains('doi.org')] @@ -319,12 +517,25 @@ def correct_dois(self, drop_duplicates = False): def generate_work_ids(self): + """ + Assigns a unique identifier (work ID) for each published work in the Results DataFrame. + """ + for i in self.index: work_id = generate_work_id(self.loc[i]) self.loc[i, 'work_id'] = work_id def update_work_ids(self, drop_duplicates = False): + """ + Checks each published work in the Results DataFrame to ensure the work ID is up-to-date. If not, generates and assigns a new work ID. 
+ + Parameters + ---------- + drop_duplicates : bool + whether to remove duplicated rows. Defaults to True. + """ + for i in self.index: work_id = generate_work_id(self.loc[i]) if self.loc[i, 'work_id'] != work_id: @@ -334,9 +545,23 @@ def update_work_ids(self, drop_duplicates = False): if drop_duplicates == True: self.remove_duplicates(drop_empty_rows=False) - def update_from_doi(self, index, drop_empty_rows = True, drop_duplicates = False, timeout: int = 60): + """ + Updates a given result using the CrossRef API if it has a DOI associated. + + Parameters + ---------- + index : int or str + row index position for the result to update. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + """ + try: old_series = self.loc[index] doi = old_series['doi'] @@ -362,9 +587,21 @@ def update_from_doi(self, index, drop_empty_rows = True, drop_duplicates = False if drop_empty_rows == True: self.drop_empty_rows() - def update_from_dois(self, drop_empty_rows = True, drop_duplicates = False, timeout: int = 60): + """ + Updates results that have DOIs associated using the CrossRef API. + + Parameters + ---------- + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + """ + self.correct_dois(drop_duplicates=False) for i in self.index: @@ -378,6 +615,10 @@ def update_from_dois(self, drop_empty_rows = True, drop_duplicates = False, time def __add__(self, results_obj): + """ + Defines addition behaviour for Results DataFrames. Returns a left-wise inner join between the two DataFrames. + """ + left = self.copy(deep=True) right = results_obj.copy(deep=True) right.columns = right.columns.astype(str).str.lower().str.replace(' ', '_') @@ -399,10 +640,38 @@ def __add__(self, results_obj): return left def to_dataframe(self): + + """ + Converts Results object to a Pandas DataFrame. + + Returns + ------- + dataframe : pandas.DataFrame + the Results object converted to a Pandas DataFrame. + """ + return self.copy(deep=True) def from_dataframe(dataframe, drop_empty_rows = False, drop_duplicates = False): # type: ignore + """ + Converts a Pandas DataFrame to a Results object. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame to convert to a Results object. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + + Returns + ------- + results_table : Results + a Results object. + """ + dataframe = dataframe.copy(deep=True).reset_index().drop('index', axis=1) results_table = Results(index = dataframe.index) results_table.columns = results_table.columns.astype(str).str.lower().str.replace(' ', '_') @@ -421,6 +690,15 @@ def from_dataframe(dataframe, drop_empty_rows = False, drop_duplicates = False): def to_pybtex(self): + """ + Converts the Results DataFrame to a Pybtex BibliographyData object. + + Returns + ------- + bib_data : pybtex.BibliographyData + a Pybtex BibliographyData object. 
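+
+        Examples
+        --------
+        Illustrative sketch only (``results`` is an assumed, populated Results object):
+
+        >>> bib_data = results.to_pybtex()
+
+        The returned object can then be serialised with to_bibtex() or to_yaml().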
+ """ + res_dict = {} for i in self.index: @@ -552,19 +830,47 @@ def to_pybtex(self): return bib_data + def to_bibtex(self): + """ + Converts the Results DataFrame to a Bibtex-formatted (.bib) bibliography string. - def to_bibtex(self): + Returns + ------- + bib_data : str + the Results DataFrame in Bibtex (.bib) bibliography file formatting. + """ bib_data = self.to_pybtex() return bib_data.to_string('bibtex') def to_yaml(self): + + """ + Converts the Results DataFrame to a YAML-formatted (.yaml) bibliography string. + + Returns + ------- + bib_data : str + the Results DataFrame in YAML (.yaml) bibliography file formatting. + """ + bib_data = self.to_pybtex() return bib_data.to_string('yaml') def export_bibtex(self, file_name = 'request_input', folder_path = 'request_input'): + """ + Exports the Results DataFrame as a Bibtex-formatted (.bib) bibliography file. + + Parameters + ---------- + file_name : str + name of file to create. Defaults to requesting from user input. + folder_path : str + location to create file. Defaults to requesting from user input. + """ + if file_name == 'request_input': file_name = input('File name: ') @@ -583,9 +889,19 @@ def export_bibtex(self, file_name = 'request_input', folder_path = 'request_inpu with open(filepath, 'w') as file: file.write(bib_bytes) - def export_yaml(self, file_name = 'request_input', folder_path = 'request_input'): + """ + Exports the Results DataFrame as a YAML-formatted (.yaml) bibliography file. + + Parameters + ---------- + file_name : str + name of file to create. Defaults to requesting from user input. + folder_path : str + location to create file. Defaults to requesting from user input. + """ + if file_name == 'request_input': file_name = input('File name: ') @@ -606,6 +922,15 @@ def export_yaml(self, file_name = 'request_input', folder_path = 'request_input' def clear_rows(self): + """ + Deletes all rows. + + Returns + ------- + self : Results + a blank Results DataFrame. + """ + results = Results() self.__dict__.update(results.__dict__) @@ -613,19 +938,57 @@ def clear_rows(self): def import_bibtex(self, file_path = 'request_input'): + """ + Reads a Bibtex (.bib) bibliography file and adds its data to the Results DataFrame. + + Parameters + ---------- + file_path : str + location of the Bibtex (.bib) bibliography file to read. + """ + df = import_bibtex(file_path = file_path) self.add_dataframe(dataframe=df, drop_duplicates=False, drop_empty_rows=False) def from_bibtex(file_path = 'request_input'): + """ + Reads a Bibtex (.bib) bibliography file and returns as a Results DataFrame. + + Parameters + ---------- + file_path : str + location of the Bibtex (.bib) bibliography file to read. + + Returns + ------- + results : Results + a Results DataFrame. + """ + results = Results() results.import_bibtex(file_path=file_path) return results - def import_excel(self, file_path = 'request_input', sheet_name = None): + """ + Reads an Excel (.xlsx) file and adds its data to the Results DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + sheet_name : str + optional: name of Excel sheet to read. + + Returns + ------- + self : Results + a Results DataFrame. 
+ """ + if file_path == 'request_input': file_path = input('File path: ') @@ -651,6 +1014,22 @@ def import_excel(self, file_path = 'request_input', sheet_name = None): def from_excel(file_path = 'request_input', sheet_name = None): # type: ignore + """ + Reads an Excel (.xlsx) file and returns as a Results DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + sheet_name : str + optional: name of Excel sheet to read. + + Returns + ------- + results_table : Results + a Results DataFrame. + """ + results_table = Results() results_table = results_table.import_excel(file_path, sheet_name).replace(np.nan, None) # type: ignore results_table.format_authors() # type: ignore @@ -660,6 +1039,20 @@ def from_excel(file_path = 'request_input', sheet_name = None): # type: ignore def import_csv(self, file_path = 'request_input'): + """ + Reads a CSV (.csv) file and adds its data to the Results DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + + Returns + ------- + self : Results + a Results object. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -681,14 +1074,41 @@ def import_csv(self, file_path = 'request_input'): def from_csv(file_path = 'request_input'): # type: ignore + """ + Reads a CSV (.csv) file and returns as a Results DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + + Returns + ------- + results_table : Results + a Results object. + """ + results_table = Results() results_table.import_csv(file_path).replace(np.nan, None) # type: ignore - return results_table def import_json(self, file_path = 'request_input'): + """ + Reads a JSON (.json) file and adds its data to the Results DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + + Returns + ------- + self : Results + a Results object. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -712,6 +1132,20 @@ def import_json(self, file_path = 'request_input'): def from_json(file_path = 'request_input'): # type: ignore + """ + Reads a JSON (.json) file and returns as a Results DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + + Returns + ------- + results_table : Results + a Results object. + """ + results_table = Results() results_table.import_json(file_path).replace(np.nan, None) # type: ignore @@ -719,6 +1153,25 @@ def from_json(file_path = 'request_input'): # type: ignore def import_file(self, file_path = 'request_input', sheet_name = None): + """ + Reads a file, determines its file type, and adds its data to the Results object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + sheet_name : str + optional: name of an Excel sheet to read (if one exists). 
+ + Notes + ----- + Can read: + * .xlsx + * .csv + * .json + * .bib + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -735,12 +1188,34 @@ def import_file(self, file_path = 'request_input', sheet_name = None): if suffix.strip('.') == 'json': return self.import_json(file_path) + + if suffix.strip('.') == 'bib': + return self.import_bibtex(file_path) else: raise ValueError('File does not exist') def from_file(file_path = 'request_input', sheet_name = None): # type: ignore + """ + Reads a file, determines its file type, and returns its data as a Results object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + sheet_name : str + optional: name of an Excel sheet to read (if one exists). + + Notes + ----- + Can read: + * .xlsx + * .csv + * .json + * .bib + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -765,11 +1240,89 @@ def from_file(file_path = 'request_input', sheet_name = None): # type: ignore def import_jstor(self, file_path = 'request_input', drop_empty_rows = False, drop_duplicates = False, update_work_ids = True): + """ + Reads a file outputted by JSTOR's Constellate portal and adds its data to the Results DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + update_work_ids : bool + whether to add and/or update work IDs. Defaults to True. + + Notes + ----- + Can read: + * .csv + * .json + """ + df = import_jstor(file_path = file_path) self.add_dataframe(dataframe=df, drop_empty_rows = drop_empty_rows, drop_duplicates = drop_duplicates, update_work_ids = update_work_ids) + def from_jstor(self, file_path = 'request_input', drop_empty_rows = False, drop_duplicates = False, update_work_ids = True): + + """ + Reads a file outputted by JSTOR's Constellate portal and returns as a Results DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + drop_duplicates : bool + whether to remove duplicated rows. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + update_work_ids : bool + whether to add and/or update work IDs. Defaults to True. + + Returns + ------- + results : Results + a Results object. + + Notes + ----- + Can read: + * .csv + * .json + """ + + results = Results() + results.import_jstor(file_path = file_path, drop_empty_rows = drop_empty_rows, drop_duplicates = drop_duplicates, update_work_ids = update_work_ids) + + return results + def search_field(self, field = 'request_input', any_kwds = 'request_input', all_kwds = None, not_kwds = None, case_sensitive = False, output = 'Results'): + """ + Searches a given field in the Results DataFrame for a string. + + Parameters + ---------- + field : str + name of field to search. Defaults to requesting from user input. + any_kwds : str or list + one or more keywords to search for. Returns results where *any* matches are found. Defaults to requesting from user input. + all_kwds : str or list + one or more keywords to search for. Returns results where *all* matches are found. Defaults to None. + not_kwds : str or list + one or more keywords to search for. Returns results where *no* matches are found. Defaults to None. 
+ case_sensitive : bool + whether to pay attention to the case of string data. Defaults to False. + output : str + the type of object to return. Defaults to Results. + + Returns + ------- + output : Results or pandas.DataFrame + search results. + """ + if field == 'request_input': field = input('Field: ') @@ -850,6 +1403,30 @@ def search_field(self, field = 'request_input', any_kwds = 'request_input', all_ def search(self, fields = 'all', any_kwds = 'request_input', all_kwds = None, not_kwds = None, case_sensitive = False, output = 'Results'): + """ + Searches for a string throughout the Results DataFrame. + + Parameters + ---------- + fields : str or list + names of one or fields to search. Defaults to 'all'. + any_kwds : str or list + one or more keywords to search for. Returns results where *any* matches are found. Defaults to requesting from user input. + all_kwds : str or list + one or more keywords to search for. Returns results where *all* matches are found. Defaults to None. + not_kwds : str or list + one or more keywords to search for. Returns results where *no* matches are found. Defaults to None. + case_sensitive : bool + whether to pay attention to the case of string data. Defaults to False. + output : str + the class of object to output. Defaults to Results. + + Returns + ------- + output : Results or pandas.DataFrame + search results. + """ + if any_kwds == 'request_input': any_kwds = input('Any keywords: ') any_kwds = any_kwds.strip().split(',') @@ -912,7 +1489,11 @@ def search(self, fields = 'all', any_kwds = 'request_input', all_kwds = None, no return output_df def get_keywords(self): - + + """ + Returns a Pandas Series containing all keywords associated with results in the Results DataFrame. + """ + output = [] for i in self['keywords']: @@ -922,25 +1503,61 @@ def get_keywords(self): if type(i) == list: output = output + i - output = pd.Series(output).str.strip().str.lower() + output = pd.Series(output).dropna() + output = output.astype(str) + output = output.str.strip().str.lower() output = output.drop(output[output.values == 'none'].index).reset_index().drop('index', axis=1)[0] # type: ignore return output - def get_keywords_list(self): + def get_keywords_list(self): + + """ + Returns a list containing all keywords associated with results in the Results DataFrame. + """ + return self.get_keywords().to_list() def get_keywords_set(self): + + """ + Returns a set containing all unique keywords associated with results in the Results DataFrame. + """ + return set(self.get_keywords_list()) def keyword_frequencies(self): + + """ + Returns a Pandas Series containing the frequencies of all keywords associated with results in the Results DataFrame. + """ + return self.get_keywords().value_counts() def keyword_stats(self): + + """ + Returns a Pandas Series containing summary statistics for the frequency of keywords associated with results in the Results DataFrame. + """ + return self.keyword_frequencies().describe() def get_titles_words(self, ignore_stopwords = True): + """ + Returns a list containing all words used in all titles across the Results DataFrame. + + Parameters + ---------- + ignore_stopwords : bool + whether to remove stopwords from the list. Uses the 'all' dataset from the art.datasets.stopwords.stopwords dictionary. Defaults to True. + + Returns + ------- + output : list + a list containing all words used in all titles across the Results DataFrame. 
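+
+        Examples
+        --------
+        Illustrative sketch only (``results`` is an assumed, populated Results object):
+
+        >>> title_words = results.get_titles_words()
+        >>> results.title_word_frequencies().head()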
+ """ + df = self.copy(deep=True) df['title'] = df['title'].astype(str).str.lower().str.strip() @@ -960,16 +1577,70 @@ def get_titles_words(self, ignore_stopwords = True): return output def get_titles_words_set(self, ignore_stopwords = True): + + """ + Returns a set containing unique words used in titles across the Results DataFrame. + + Parameters + ---------- + ignore_stopwords : bool + whether to remove stopwords from the set. Uses the 'all' dataset from the art.datasets.stopwords.stopwords dictionary. Defaults to True. + + Returns + ------- + output : set + a set containing unique words used in titles across the Results DataFrame. + """ + return set(self.get_titles_words(ignore_stopwords = ignore_stopwords)) def title_word_frequencies(self, ignore_stopwords = True): + + """ + Returns a Pandas Series containing the frequencies of words used in titles across the Results DataFrame. + + Parameters + ---------- + ignore_stopwords : bool + whether to ignore stopwords. Uses the 'all' dataset from the art.datasets.stopwords.stopwords dictionary. Defaults to True. + + Returns + ------- + frequencies : pandas.Series + a Pandas Series containing the frequencies of words used in titles across the Results DataFrame. + """ + return pd.Series(self.get_titles_words(ignore_stopwords = ignore_stopwords)).value_counts() def title_words_stats(self, ignore_stopwords = True): + + """ + Returns a Pandas Series containing summary statistics for the frequency of words used in titles across the Results DataFrame. + + Parameters + ---------- + ignore_stopwords : bool + whether to ignore stopwords. Uses the 'all' dataset from the art.datasets.stopwords.stopwords dictionary. Defaults to True. + + Returns + ------- + frequencies : pandas.Series + a Pandas Series containing summary statistics for the frequency of words used in titles across the Results DataFrame. + """ + return self.title_word_frequencies(ignore_stopwords = ignore_stopwords).describe() - def drop_containing_keywords(self, keywords): + def drop_containing_keywords(self, keywords: list): + """ + Removes all rows which contain any of the inputted keywords. + + Parameters + ---------- + keywords : list + a list of keywords to search for. + """ + if type(keywords) == str: keywords = [keywords] @@ -978,6 +1649,20 @@ def drop_containing_keywords(self, keywords): def filter_by_keyword_frequency(self, cutoff = 3): + """ + Filters the Results DataFrame to show only results which contain keywords that meet a frequency cutoff. + + Parameters + ---------- + cutoff : int + a frequency cutoff for keywords. + + Returns + ------- + output : Results or pandas.DataFrame + the filtered DataFrame. + """ + keywords_freq = self.keyword_frequencies() frequent_kws = keywords_freq[keywords_freq.values > cutoff] # type: ignore @@ -996,13 +1681,49 @@ def filter_by_keyword_frequency(self, cutoff = 3): return self.loc[output.index] def has_citations_data(self): + + """ + Returns all Results entries which contain citations data. + """ + return self[~self['citations_data'].isna()] def has(self, column): + + """ + Returns all Results entries which contain data in the inputted column. + + Parameters + ---------- + column : str + name of column to filter on. + + Returns + ------- + self : Results + the masked Results DataFrame. + """ + return self[~self[column].isna()] def contains(self, query: str = 'request_input', ignore_case: bool = True) -> bool: + """ + Checks if the Results DataFrame contains an inputted string. Returns True if yes; else, returns False. 
+ + Parameters + ---------- + query : str + a string to check for. Defaults to requesting from user input. + ignore_case : bool + whether to ignore the case of string data. Defaults to True. + + Returns + ------- + result : bool + whether the Results DataFrame contains the string. + """ + if query == 'request_input': query = input('Search query').strip() @@ -1080,6 +1801,22 @@ def contains(self, query: str = 'request_input', ignore_case: bool = True) -> bo def mask_affiliations(self, query: str = 'request_input', ignore_case: bool = True): + """ + Filters the Results DataFrame for entries which contain an inputted string in their affiliations data. + + Parameters + ---------- + query : str + a string to search for. Defaults to requesting from user input. + ignore_case : bool + whether to ignore the case of string data. Defaults to True. + + Returns + ------- + output : Results or pandas.DataFrame + the filtered DataFrame. + """ + if query == 'request_input': query = input('Search query: ').strip() @@ -1102,6 +1839,22 @@ def affil_masker(authors): def mask_entities(self, column, query: str = 'request_input', ignore_case: bool = True): + """ + Filters the Results DataFrame for entries which contain an inputted string in their authors or funders data. + + Parameters + ---------- + query : str + a string to search for. Defaults to requesting from user input. + ignore_case : bool + whether to ignore the case of string data. Defaults to True. + + Returns + ------- + output : Results or pandas.DataFrame + the filtered DataFrame. + """ + if query == 'request_input': query = input('Search query').strip() @@ -1123,6 +1876,10 @@ def entity_masker(entities): def format_funders(self, use_api: bool = False): + """ + Formats all funders data as Funders objects. + """ + try: funders = self['funder'].apply(format_funders) # type: ignore except: diff --git a/art/classes/review.py b/art/classes/review.py index 46f43b8..39cb6f9 100644 --- a/art/classes/review.py +++ b/art/classes/review.py @@ -16,13 +16,12 @@ from .affiliations import Affiliation, Affiliations, format_affiliations from .funders import Funders, format_funders from .results import Results, Funder, generate_work_id -from .references import References, is_formatted_reference, extract_references +from .references import References, is_formatted_reference, format_references from .activitylog import ActivityLog from .authors import Author, Authors, format_authors as orig_format_authors from .networks import Network, Networks from .citation_crawler import citation_crawler, academic_scraper - import copy import pickle from pathlib import Path @@ -35,6 +34,21 @@ def add_pdf(self, path = 'request_input'): + """ + Imports a PDF file using a filepath and adds to the Results dataframe. + + Parameters + ---------- + path : str + filepath for PDF. Requests input if none passed explicitly. + + Returns + ------- + self : Results + a Results object. + """ + + if path == 'request_input': path = input('Path to PDF (URL or filepath): ') @@ -56,8 +70,22 @@ def add_pdf(self, path = 'request_input'): def add_row(self, data): + """ + Adds inputted data as a row to Results dataframe. + + Parameters + ---------- + data : pandas.Series + data to add to Results. + + Returns + ------- + self : Results + a Results object. 
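+
+        Examples
+        --------
+        An illustrative sketch only; it assumes an existing Results object named ``results`` and uses the 'title' column referenced elsewhere in this diff:
+
+        >>> import pandas as pd
+        >>> row = pd.Series(index=results.columns, dtype=object)   # blank row matching the Results columns
+        >>> row['title'] = 'An example article title'
+        >>> results.add_row(row)                                    # appends the row and assigns it a work_id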
+ """ + if type(data) != pd.Series: - raise TypeError(f'Results must be a Pandas.Series, not {type(data)}') + raise TypeError(f'Results must be a pandas.Series, not {type(data)}') data.index = data.index.astype(str).str.lower().str.replace(' ', '_') if len(data) != len(self.columns): @@ -70,7 +98,6 @@ def add_row(self, data): work_id = generate_work_id(data) work_id = self.get_unique_id(work_id, index) data['work_id'] = work_id - self.loc[index] = data self.format_authors() @@ -79,8 +106,30 @@ def add_row(self, data): def add_dataframe(self, dataframe: pd.DataFrame, drop_duplicates = False, drop_empty_rows = False, update_work_ids = True, format_authors = False): + """ + Merges inputted dataframe with Results dataframe. + + Parameters + ---------- + dataframe : pandas.DataFrame + dataframe to add to Results. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_work_ids : bool + whether to update results work ID's. + format_authors : bool + whether to format author data. + + Returns + ------- + self : Results + a Results object. + """ + if (type(dataframe) != pd.DataFrame) and (type(dataframe) != pd.Series): - raise TypeError(f'Results must be a Pandas.Series or Pandas.DataFrame, not {type(dataframe)}') + raise TypeError(f'Results must be a pandas.Series or pandas.DataFrame, not {type(dataframe)}') dataframe = dataframe.reset_index().drop('index', axis=1) dataframe.columns = dataframe.columns.astype(str).str.lower().str.replace(' ', '_') @@ -116,16 +165,39 @@ def add_dataframe(self, dataframe: pd.DataFrame, drop_duplicates = False, drop_ Results.add_dataframe = add_dataframe # type: ignore def has_formatted_citations(self): + + """ + Returns all results entries which contain properly formatted citations. + """ + return self[self['citations'].apply(is_formatted_reference)] Results.has_formatted_citations = has_formatted_citations # type: ignore def lacks_formatted_citations(self): + + """ + Returns all results entries which lack properly formatted citations. + """ + return self[~self['citations'].apply(is_formatted_reference)] Results.lacks_formatted_citations = lacks_formatted_citations # type: ignore def format_citations(self, add_work_ids = False, update_from_doi = False, verbose = True): + + """ + Formats all results entries' citations data as References objects. + + Parameters + ---------- + add_work_ids : bool + whether to add new work ID's to results entries. + update_from_doi : bool + whether to update results data from DOI's. + verbose : bool + whether to print dialogue during formatting. + """ self['citations'] = self['citations'].replace({np.nan: None}) self['citations_data'] = self['citations_data'].replace({np.nan: None}) @@ -148,7 +220,7 @@ def format_citations(self, add_work_ids = False, update_from_doi = False, verbos indices = unformatted.index processing_count = 0 for i in indices: - refs = extract_references(self.loc[i, 'citations_data'], add_work_ids = add_work_ids, update_from_doi = update_from_doi) + refs = format_references(self.loc[i, 'citations_data'], add_work_ids = add_work_ids, update_from_doi = update_from_doi) refs_count = None if 'refs_count' in refs.__dict__.keys(): @@ -175,6 +247,10 @@ def format_citations(self, add_work_ids = False, update_from_doi = False, verbos def format_authors(self): + """ + Formats all results entries' authors data as Authors objects. 
+ """ + if len(self[self['authors_data'].isna()]) < len(self['authors_data']): authors_data = self['authors_data'] @@ -191,6 +267,26 @@ def format_authors(self): def add_citations_to_results(self, add_work_ids = False, update_from_doi = False, drop_duplicates = False, drop_empty_rows = True): + """ + Formats all results entries' citations and adds them to the Results object. + + Parameters + ---------- + add_work_ids : bool + whether to add work ID's to newly added results entries. Defaults to False. + update_from_doi : bool + whether to update results data from DOI's. Defaults to False. + drop_empty_rows : bool + whether to remove duplicate rows. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + + Returns + ------- + self : Results + a Results object. + """ + if drop_empty_rows == True: self.drop_empty_rows() @@ -227,7 +323,7 @@ def add_citations_to_results(self, add_work_ids = False, update_from_doi = False class Review: """ - This is a Review object. It stores the data from academic reviews. + This is a Review object. It stores data from academic reviews. Parameters ---------- @@ -237,6 +333,23 @@ class Review: file location associated with Review. file_type : str file type associated with Review file(s). + + Attributes + ---------- + properties : Properties + metadata associated with this Review object. + results : Results + data on publications. + authors : Authors + data on authors. + funders : Funders + data on funders. + affiliations : Affiliations + data on authors' affiliate organisations. + networks : Networks + network objects derived from Review data. + activity_log : ActivityLog + metadata logging changes to the Review, including: additions, deletions, crawling, and searches. """ results = Results() @@ -290,7 +403,7 @@ def update_properties(self): def __repr__(self): """ - Defines how Reviews are represented in string form. + Defines how Review objects are represented in string form. """ output = f'\n\n{"-"*(13+len(self.properties.review_name))}\nReview name: {self.properties.review_name}\n{"-"*(13+len(self.properties.review_name))}\n\nProperties:\n-----------\n{self.properties}\n\nDescription:\n------------\n\n{self.description}\n\nResults:\n--------\n\n{self.results}\n\nAuthors:\n--------\n\n{self.authors.summary.head(10)}\n\nFunders:\n--------\n\n{self.funders.summary.head(10)}\n\n' @@ -309,6 +422,11 @@ def __getitem__(self, key): """ Retrieves Review contents or results using an index/key. + + Returns + ------- + item : object + item associated with the inputted key. """ if key in self.__dict__.keys(): @@ -349,18 +467,37 @@ def contents(self): """ Returns the Review's attributes as a list. + + Returns + ------- + contents : list + the names of the Review object's attributes. """ return self.__dict__.keys() def __len__(self): + """ + Returns the number of entries in the Results table. + + Returns + ------- + result : int + the number of results entries contained in the Results dataframe. + """ + return len(self.results) def count_results(self): """ Returns the number of entries in the Results table. + + Returns + ------- + result : int + the number of results entries contained in the Results dataframe. """ return len(self.results) @@ -369,6 +506,11 @@ def to_list(self): """ Returns the Review as a list. + + Returns + ------- + result : list + Review object formatted as a list. """ return [i for i in self] @@ -377,6 +519,11 @@ def to_dict(self): """ Returns the Review as a dictionary. 
Excludes the Review's 'properties' attribute. + + Returns + ------- + output_dict : dict + Review object formatted as a dictionary. """ output_dict = {} @@ -386,37 +533,94 @@ def to_dict(self): return output_dict def to_bibtex(self): + + """ + Returns an object containing Results data in bibtex format. + """ + return self.results.to_bibtex() def to_yaml(self): + + """ + Returns an object containing Results data in .yaml format. + """ + return self.results.to_yaml() def export_bibtex(self, file_name = 'request_input', folder_path= 'request_input'): + + """ + Exports Results data as a .bib file. + + Parameters + ---------- + file_name : str + name for export file. Defaults to requesting from user input. + folder_path : str + directory path for folder to export to. Defaults to requesting from user input. + """ + return self.results.export_bibtex(file_name=file_name, folder_path=folder_path) def export_yaml(self, file_name = 'request_input', folder_path= 'request_input'): + + """ + Exports Results data as a .yaml file. + + Parameters + ---------- + file_name : str + name for export file. Defaults to requesting from user input. + folder_path : str + directory path for folder to export to. Defaults to requesting from user input. + """ + return self.results.export_yaml(file_name=file_name, folder_path=folder_path) def copy(self): """ - Returns the a copy of the Review. + Returns a copy of the Review object. """ return copy.deepcopy(self) - def get_result(self, row_position, column_position = None): + def get_result(self, index_position, column_position = None): """ - Returns a result when given its attribute name. + Returns a result when given its index position, with an option for specifying column position. + Equivalent to pandas.DataFrame.loc[...]. + + Parameters + ---------- + index_position : int + index position of result entry to return from Results dataframe. + column_position : object + name of column of datapoint to return from Results dataframe. + + Returns + ------- + result : object + the selected object. """ if column_position == None: - return self.results.loc[row_position] + return self.results.loc[index_position] else: - return self.results.loc[row_position, column_position] + return self.results.loc[index_position, column_position] def get_affiliations_dict(self): + + """ + Returns all affiliations associated with Authors objects as a dictionary. + + Returns + ------- + result : Affiliations + Affiliations associated with Authors objects. + """ + return self.authors.affiliations() def get_name_str(self): @@ -440,6 +644,22 @@ def get_name_str(self): def add_pdf(self, path = 'request_input', update_formatting: bool = True): + """ + Reads a PDF and adds its data to the Results dataframe. + + Parameters + ---------- + path : str + file path for PDF to read. Defaults to requesting from user input. + update_formatting : bool + whether to format the added data (e.g., citations, authors, funders, and affiliations). + + Returns + ------- + self : Review + a Review object. + """ + + old_res_len = len(self.results) self.results.add_pdf(path) # type: ignore new_res_len = len(self.results) @@ -453,11 +673,18 @@ def add_pdf(self, path = 'request_input', update_formatting: bool = True): self.update_properties() + return self + def varstr(self): """ Returns the Review's name as a string. Defaults to using its variable name; falls back to using its name property. + Returns + ------- + string : str + the Review's name as a string.
+ Notes ----- * Searches global environment dictionary for objects sharing Review's ID. Returns key if found. @@ -483,11 +710,54 @@ def varstr(self): return string - def to_dataframe(self): - return self.results.to_dataframe() # type: ignore + def to_dataframe(self, attribute: str = 'results'): + + """ + Returns one of the Review's datasets as a Pandas DataFrame. Defaults to returning the Results dataset. + + Parameters + ---------- + attribute : str + the name of the Review dataset to return. + + Returns + ------- + df : pandas.DataFrame + the dataset as a pandas dataframe. + """ + + df = pd.DataFrame(dtype=object) + + if attribute.lower() == 'results': + df = self.results.to_dataframe() # type: ignore + + if attribute.lower() == 'authors': + df = self.authors.to_dataframe() # type: ignore + + if attribute.lower() == 'funders': + df = self.funders.to_dataframe() # type: ignore + + if (attribute.lower() == 'affiliations') or (attribute.lower() == 'affils'): + df = self.affiliations.to_dataframe() # type: ignore + + return df def from_dataframe(dataframe: pd.DataFrame): # type: ignore + """ + Creates a Review object from a Pandas DataFrame. + + Parameters + ---------- + dataframe : pandas.DataFrame + a Pandas DataFrame to use. + + Returns + ------- + review : Review + a Review object. + """ + review = Review() review.results = Results.from_dataframe(dataframe) # type: ignore review.format() # type: ignore @@ -496,6 +766,10 @@ def from_dataframe(dataframe: pd.DataFrame): # type: ignore def format_funders(self): + """ + Formats results entries' funders data into Funders objects and stores in Review's Funders attribute. + """ + self.results.format_funders() # type: ignore funders_data = self.results['funder'].to_list() @@ -528,6 +802,11 @@ def format_funders(self): continue def format_affiliations(self): + + """ + Formats authors' affiliations data as Affiliations objects and stores in Review's Affiliations attribute. + """ + self.authors.format_affiliations() affils_data = self.authors.summary['affiliations'].to_list() @@ -563,10 +842,31 @@ def format_affiliations(self): continue def format_citations(self, add_work_ids = False, update_from_doi = False, verbose=True): + + """ + Formats results entries' citations data into References objects. + + Parameters + ---------- + add_work_ids : bool + whether to add work ID's to References entries. + """ + self.results.format_citations(add_work_ids = add_work_ids, update_from_doi=update_from_doi, verbose=verbose) # type: ignore def format_authors(self, drop_duplicates = False, drop_empty_rows=True): + """ + Formats results entries' authors data into Authors objects and stores in Review's Authors attribute. + + Parameters + ---------- + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + """ + self.results.format_authors() # type: ignore authors_data = self.results['authors'].to_list() @@ -590,6 +890,19 @@ def format_authors(self, drop_duplicates = False, drop_empty_rows=True): def update_author_attrs(self, ignore_case: bool = True, drop_duplicates = False, drop_empty_rows=True): + """ + Formats authors entries, identifies their publications, and stores these. + + Parameters + ---------- + ignore_case : bool + whether to ignore the case of string data. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. 
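+
+        Examples
+        --------
+        An illustrative sketch only; it assumes a Review named ``review`` whose results already contain author data:
+
+        >>> review.format_authors()
+        >>> review.update_author_attrs(ignore_case=True)   # links each author to their publications in the results data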
+ """ + self.authors.sync(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) auths_data = self.authors.summary[['author_id', 'orcid', 'google_scholar', 'crossref', 'scopus', 'full_name']] @@ -646,6 +959,15 @@ def update_author_attrs(self, ignore_case: bool = True, drop_duplicates = False, def update_funder_attrs(self, ignore_case: bool = True): + """ + Formats funders entries, identifies their publications, and stores these. + + Parameters + ---------- + ignore_case : bool + whether to ignore the case of string data. + """ + self.funders.sync_all() f_data = self.funders.summary[['funder_id', 'uri', 'crossref_id', 'website','name']] @@ -701,6 +1023,15 @@ def update_funder_attrs(self, ignore_case: bool = True): def update_affiliation_attrs(self, update_authors: bool = True, ignore_case: bool = True): + """ + Formats affiliations entries, identifies their publications, and stores these. + + Parameters + ---------- + ignore_case : bool + whether to ignore the case of string data. + """ + if update_authors == True: self.update_author_attrs(ignore_case=ignore_case) @@ -760,11 +1091,46 @@ def update_affiliation_attrs(self, update_authors: bool = True, ignore_case: boo def update_entity_attrs(self, ignore_case: bool = True): + """ + Formats authors, funders, and affiliations entries; identifies their publications; and stores these + + Parameters + ---------- + ignore_case : bool + whether to ignore the case of string data. + """ + self.update_author_attrs(ignore_case=ignore_case) self.update_affiliation_attrs(update_authors=False, ignore_case=ignore_case) self.update_funder_attrs(ignore_case=ignore_case) def get_coauthors(self, format: bool = True, update_attrs: bool = True, ignore_case: bool = True, add_to_authors: bool = True, drop_duplicates = False, drop_empty_rows=True): + + """ + Returns a dictionary of co-authors. + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author attributes. + ignore_case : bool + whether to ignore the case of string data. + add_to_authors : bool + whether to store the dict of co-authors in the Review's Authors attribute. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + + Returns + ------- + output : dict + a dictionary containing co-authors. + * Keys: author IDs + * Values: co-authors + """ if format == True: self.format(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -815,6 +1181,32 @@ def get_coauthors(self, format: bool = True, update_attrs: bool = True, ignore_c def get_cofunders(self, format: bool = True, update_attrs: bool = True, ignore_case: bool = True, add_to_funders: bool = True): + """ + Returns a dictionary of co-funders. + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update funder attributes. + ignore_case : bool + whether to ignore the case of string data. + add_to_funders : bool + whether to store the dict of co-funders in the Review's Funders attribute. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + + Returns + ------- + output : dict + a dictionary containing co-funders. 
+ * Keys: funder IDs + * Values: co-funders + """ + if format == True: self.format() @@ -864,6 +1256,17 @@ def get_cofunders(self, format: bool = True, update_attrs: bool = True, ignore_c def remove_duplicates(self, drop_empty_rows=True, use_api=False): + """ + Removes duplicate data entries from results, authors, funders, and affiliations datasets. + + Parameters + ---------- + drop_empty_rows : bool + whether to remove rows which do not contain any data. + use_api : bool + whether to update data using CrossRef, Orcid, and other APIs. + """ + orig_res_len = len(self.results) self.results.remove_duplicates(drop_empty_rows=drop_empty_rows, update_from_api=use_api) # type: ignore new_res_len = len(self.results) @@ -894,6 +1297,21 @@ def remove_duplicates(self, drop_empty_rows=True, use_api=False): def format(self, update_entities = False, drop_duplicates = False, drop_empty_rows=True, verbose=False): + """ + Parses and formats all datasets (i.e., results, authors, funders and affiliations). + + Parameters + ---------- + update_attrs : bool + whether to update entity attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + verbose : bool + whether to print formatting dialogue. + """ + self.format_funders() self.format_citations(verbose=verbose) self.format_authors(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -925,6 +1343,19 @@ def format(self, update_entities = False, drop_duplicates = False, drop_empty_ro def add_citations_to_results(self, update_formatting: bool = True, drop_duplicates = False, drop_empty_rows = True): + """ + Formats all results entries' citations and adds them to the Review's Results attribute. + + Parameters + ---------- + update_formatting : bool + whether to format results, authors, funders, and affiliations data. + drop_empty_rows : bool + whether to remove duplicate rows. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + """ + self.results.add_citations_to_results(drop_duplicates = drop_duplicates, drop_empty_rows = drop_empty_rows) # type: ignore if drop_empty_rows == True: @@ -952,7 +1383,20 @@ def add_citations_to_results(self, update_formatting: bool = True, drop_duplicat def update_from_orcid(self, update_formatting: bool = True, drop_duplicates = False, drop_empty_rows=True): - orcid_len = len(self.authors.with_orcid()) + """ + Updates Authors data using the Orcid API. + + Parameters + ---------- + update_formatting : bool + whether to format results, authors, funders, and affiliations data. + drop_empty_rows : bool + whether to remove duplicate rows. Defaults to False. + drop_empty_rows : bool + whether to remove rows which do not contain any data. Defaults to False. + """ + + orcid_len = len(self.authors.has_orcid()) old_auths_len = len(self.authors.summary) self.authors.update_from_orcid(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -968,6 +1412,28 @@ def update_from_orcid(self, update_formatting: bool = True, drop_duplicates = Fa def add_dataframe(self, dataframe: pd.DataFrame, drop_empty_rows = False, drop_duplicates = False, update_formatting: bool = True): + """ + Merges inputted dataframe with Review's Results dataset. + + Parameters + ---------- + dataframe : pandas.DataFrame + dataframe to add to Results. + drop_duplicates : bool + whether to remove duplicated rows. 
+ drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_work_ids : bool + whether to update results entries' work ID's. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + + Returns + ------- + self : Review + a Review object. + """ + orig_len = len(self.results) self.results.add_dataframe(dataframe=dataframe, drop_empty_rows=drop_empty_rows, drop_duplicates=drop_duplicates) # type: ignore new_len = len(self.results) @@ -995,8 +1461,25 @@ def add_dataframe(self, dataframe: pd.DataFrame, drop_empty_rows = False, drop_d return self - def import_bibtex(self, file_path = 'request_input', update_formatting: bool = False, update_entities = False): + def import_bibtex(self, file_path = 'request_input', drop_empty_rows = False, drop_duplicates = False, update_formatting: bool = False, update_entities = False): + """ + Reads a Bibtex (.bib) bibliography file and adds its data to Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -1010,19 +1493,74 @@ def import_bibtex(self, file_path = 'request_input', update_formatting: bool = F self.properties.file_location = file_path self.properties.update_file_type() + + if drop_duplicates == True: + self.remove_duplicates(drop_empty_rows = drop_empty_rows) + + if update_formatting == True: + self.format(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + + if update_entities == True: + self.update_entity_attrs() - def from_bibtex(file_path = 'request_input', update_formatting: bool = False, update_entities = False): + def from_bibtex(file_path = 'request_input', drop_empty_rows = False, drop_duplicates = False, update_formatting: bool = False, update_entities = False): + + """ + Reads .bib file and returns a Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + review : Review + a Review object. + """ if file_path == 'request_input': file_path = input('File path: ') review = Review(file_location=file_path) - review.import_bibtex(file_path=file_path, update_formatting=update_formatting, update_entities=update_entities) + review.import_bibtex(file_path=file_path, drop_empty_rows = drop_empty_rows, drop_duplicates = drop_duplicates, update_formatting=update_formatting, update_entities=update_entities) return review def import_excel(self, file_path = 'request_input', sheet_name = None, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): + """ + Reads an Excel (.xlsx) file and adds its data to the Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. 
+ sheet_name : str + optional: name of Excel sheet to read. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + self : Review + a Review object. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -1062,14 +1600,40 @@ def import_excel(self, file_path = 'request_input', sheet_name = None, update_fo return self - def from_excel(file_path = 'request_input', sheet_name = None, update_entities = False, drop_empty_rows = False, drop_duplicates = False): # type: ignore + def from_excel(file_path = 'request_input', sheet_name = None, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): # type: ignore + """ + Reads an Excel (.xlsx) file and returns as a Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + sheet_name : str + optional: name of Excel sheet to read. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_entities : bool + whether to update entity attributes. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + + Returns + ------- + review : Review + a Review object. + """ + if file_path == 'request_input': file_path = input('File path: ') review = Review(file_location=file_path) review.results = Results.from_excel(file_path, sheet_name) # type: ignore - review.format(update_entities=update_entities, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) + + if update_formatting == True: + review.format(update_entities=update_entities, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) review.activity_log.add_activity(type='data import', activity='created Review from imported Excel file', location=['results', 'authors', 'funders', 'affiliations']) @@ -1077,6 +1641,28 @@ def from_excel(file_path = 'request_input', sheet_name = None, update_entities = def import_csv(self, file_path = 'request_input', update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): + """ + Reads a CSV (.csv) file and adds its data to the Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + self : Review + a Review object. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -1118,11 +1704,34 @@ def import_csv(self, file_path = 'request_input', update_formatting: bool = True def from_csv(file_path = 'request_input', update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): # type: ignore + """ + Reads a CSV (.csv) file and returns as a Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. 
+ drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + review : Review + a Review object. + """ + if file_path == 'request_input': file_path = input('File path: ') review = Review(file_location=file_path) review.results = Results.from_csv(file_path) # type: ignore + if update_formatting == True: review.format(update_entities=update_entities, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -1132,6 +1741,22 @@ def from_csv(file_path = 'request_input', update_formatting: bool = True, update def import_json(self, file_path = 'request_input', update_formatting: bool = True): + """ + Reads a JSON (.json) file and adds its data to the Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + + Returns + ------- + self : Review + a Review object. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -1148,6 +1773,20 @@ def import_json(self, file_path = 'request_input', update_formatting: bool = Tru def from_json(file_path = 'request_input'): # type: ignore + """ + Reads a JSON (.json) file and returns as a Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + + Returns + ------- + review : Review + a Review object. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -1157,6 +1796,36 @@ def from_json(file_path = 'request_input'): # type: ignore return review def import_file(self, file_path = 'request_input', sheet_name = None, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): + + """ + Reads a file, determines its file type, and adds its data to the Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + sheet_name : str + optional: name of an Excel sheet to read. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Notes + ----- + Can read: + * .xlsx + * .csv + * .json + * .bib + * .yaml + * .txt + * .review + """ if file_path == 'request_input': file_path = input('File path: ') @@ -1179,6 +1848,41 @@ def import_file(self, file_path = 'request_input', sheet_name = None, update_for def from_file(file_path = 'request_input', sheet_name = None, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): # type: ignore + """ + Reads a file, determines its file type, and returns its data as a Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + sheet_name : str + optional: name of an Excel sheet to read. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. 
+ update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + review : Review + a Review object. + + Notes + ----- + Can read: + * .xlsx + * .csv + * .json + * .bib + * .yaml + * .txt + * .review + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -1190,6 +1894,34 @@ def from_file(file_path = 'request_input', sheet_name = None, update_formatting: def import_jstor(self, file_path = 'request_input', drop_empty_rows = False, drop_duplicates = False, update_work_ids = True, format_citations=True, format_authors = True, format_funders = True, format_affiliations=True): + """ + Reads a file outputted by JSTOR's Constellate portal and adds its data to the Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + review : Review + a Review object. + + Notes + ----- + Can read: + * .csv + * .json + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -1218,6 +1950,29 @@ def import_jstor(self, file_path = 'request_input', drop_empty_rows = False, dro def from_jstor(file_path: str = 'request_input', drop_empty_rows = False, drop_duplicates = False, update_work_ids = True, format_citations=True, format_authors = True, format_funders = True, format_affiliations=True): # type: ignore + """ + Reads a file outputted by JSTOR's Constellate portal and returns its data as a Review object. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Notes + ----- + Can read: + * .csv + * .json + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -1227,10 +1982,57 @@ def from_jstor(file_path: str = 'request_input', drop_empty_rows = False, drop_d return review def search_field(self, field = 'request_input', any_kwds = 'request_input', all_kwds = None, not_kwds = None, case_sensitive = False, output = 'Results'): + + """ + Searches a given field in the Results dataset for a string. + + Parameters + ---------- + field : str + name of field to search. Defaults to requesting from user input. + any_kwds : str or list + one or more keywords to search for. Returns results where *any* matches are found. Defaults to requesting from user input. + all_kwds : str or list + one or more keywords to search for. Returns results where *all* matches are found. Defaults to None. + not_kwds : str or list + one or more keywords to search for. Returns results where *no* matches are found. Defaults to None. + case_sensitive : bool + whether to pay attention to the case of string data. Defaults to False. + output : str + the type of object to return. Defaults to Results. + + Returns + ------- + output : Results or pandas.DataFrame + search results. 
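+
+        Examples
+        --------
+        An illustrative sketch only; 'title' is used because it is a column referenced elsewhere in this diff, and the keyword is arbitrary:
+
+        >>> hits = review.search_field(field='title', any_kwds='bibliometrics')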
+ """ + return self.results.search_field(field = field, any_kwds = any_kwds, all_kwds = all_kwds, not_kwds = not_kwds, case_sensitive = case_sensitive, output = output) # type: ignore def search(self, any_kwds = 'request_input', all_kwds = None, not_kwds = None, fields = 'all', case_sensitive = False): + """ + Searches for a string throughout Review. + + Parameters + ---------- + any_kwds : str or list + one or more keywords to search for. Returns results where *any* matches are found. Defaults to requesting from user input. + all_kwds : str or list + one or more keywords to search for. Returns results where *all* matches are found. Defaults to None. + not_kwds : str or list + one or more keywords to search for. Returns results where *no* matches are found. Defaults to None. + fields : str or list + names of one or fields to search. Defaults to 'all'. + case_sensitive : bool + whether to pay attention to the case of string data. Defaults to False. + + Returns + ------- + output : pandas.DataFrame + search results. + """ + combined_query = str(any_kwds) if all_kwds is not None: combined_query = combined_query + str(all_kwds) @@ -1271,7 +2073,7 @@ def export_folder(self, folder_name = 'request_input', folder_address = 'request Parameters ---------- folder_name : str - name of folder to create. Defaults to using the object's variable name. + name of folder to create. Defaults to requesting from user input. folder_address : str directory address to create folder in. defaults to requesting for user input. export_str_as : str @@ -1282,6 +2084,25 @@ def export_folder(self, folder_name = 'request_input', folder_address = 'request file type for exporting Pandas objects. Defaults to 'csv'. export_network_as : str file type for exporting network objects. Defaults to 'graphML'. + + Options + ------- + export_str_as: + * txt or .txt (Default) + export_dict_as: + * json or .json (Default) + * txt or .txt + export_pandas_as: + * csv or .csv (Default) + * xlsx or .xlsx or Excel + export_network_as: + * graphML or .graphML (Default) + * gml or .gml + * leda + * lgl + * ncol + * pajek + * kumu (i.e., formatted .json) """ if folder_name == 'request_input': @@ -1300,7 +2121,7 @@ def export_txt(self, new_file = True, file_name: str = 'request_input', folder_a Parameters ---------- file_name : str - name of file to create. Defaults to using the object's variable name. + name of file to create. Defaults to requesting from user input. file_address : str directory address to create file in. defaults to requesting for user input. """ @@ -1335,7 +2156,7 @@ def export_review(self, new_file = True, file_name: str = 'request_input', folde Parameters ---------- file_name : str - name of file to create. Defaults to using the object's variable name. + name of file to create. Defaults to requesting from user input. file_address : str directory address to create file in. defaults to requesting for user input. """ @@ -1374,6 +2195,48 @@ def save_as(self, export_pandas_as: str = 'csv', export_network_as: str = 'graphML'): + """ + Saves the Review to a new file with an inputted name at a specified location. + + Parameters + ---------- + filetype : str + type of file to save. Defaults to 'review'. + file_name : str + name of file to create. Defaults to requesting from user input. + folder_address : str + directory address of folder to create file in. defaults to requesting from user input. + export_str_as : str + file type for exporting string objects. Defaults to 'txt'. 
+ export_dict_as : str + file type for exporting dictionary objects. Defaults to 'json'. + export_pandas_as : str + file type for exporting Pandas objects. Defaults to 'csv'. + export_network_as : str + file type for exporting network objects. Defaults to 'graphML'. + + Options + ------- + filetype: + * txt or + export_str_as: + * txt or .txt (Default) + export_dict_as: + * json or .json (Default) + * txt or .txt + export_pandas_as: + * csv or .csv (Default) + * xlsx or .xlsx or Excel + export_network_as: + * graphML or .graphML (Default) + * gml or .gml + * leda + * lgl + * ncol + * pajek + * kumu (i.e., formatted .json) + """ + if file_name == 'request_input': file_name = input('File name: ') @@ -1451,6 +2314,40 @@ def save(self, export_pandas_as: str = 'csv', export_network_as: str = 'graphML'): + """ + Saves the Review to the filepath stored in its Properties attribute. + + Parameters + ---------- + export_str_as : str + file type for exporting string objects. Defaults to 'txt'. + export_dict_as : str + file type for exporting dictionary objects. Defaults to 'json'. + export_pandas_as : str + file type for exporting Pandas objects. Defaults to 'csv'. + export_network_as : str + file type for exporting network objects. Defaults to 'graphML'. + + Options + ------- + export_str_as: + * txt or .txt (Default) + export_dict_as: + * json or .json (Default) + * txt or .txt + export_pandas_as: + * csv or .csv (Default) + * xlsx or .xlsx or Excel + export_network_as: + * graphML or .graphML (Default) + * gml or .gml + * leda + * lgl + * ncol + * pajek + * kumu (i.e., formatted .json) + """ + file_path = self.properties.file_location if (file_path is None) or (file_path == ''): @@ -1499,6 +2396,15 @@ def save(self, def import_txt(self, file_path: str = 'request_input'): + """ + Imports data from a pickled .txt file and adds to the Review object. + + Parameters + ---------- + file_path : str + directory path of .txt file to import. + """ + if file_path == 'request_input': file_path = input('File address: ') @@ -1521,6 +2427,20 @@ def import_txt(self, file_path: str = 'request_input'): def from_txt(file_path: str = 'request_input'): # type: ignore + """ + Imports a Review from a pickled .txt file. + + Parameters + ---------- + file_path : str + directory path of .txt file to import. + + Returns + ------- + review : Review + a Review object. + """ + if file_path == 'request_input': file_path = input('File address: ') @@ -1534,6 +2454,20 @@ def from_txt(file_path: str = 'request_input'): # type: ignore def open(file_path: str = 'request_input'): # type: ignore + """ + Imports a Review from a .review or .txt file. + + Parameters + ---------- + file_path : str + directory path of .txt file to import. + + Returns + ------- + review : Review + a Review object. + """ + if file_path == 'request_input': file_path = input('File address: ') @@ -1548,25 +2482,90 @@ def open(file_path: str = 'request_input'): # type: ignore def scrape_article(self, url = 'request_input'): + """ + Scrapes article data from a given URL and adds to Results. + + Parameters + ---------- + url : str + url of article to scrape. Defaults to requesting from user input. 
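+
+        Examples
+        --------
+        An illustrative sketch only; the URL below is a placeholder, not a real article address (see the Notes below for the sites this scraper supports):
+
+        >>> review.scrape_article('https://example.org/articles/placeholder')   # scraped metadata is appended to review.results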
+ + Notes + ----- + This function is capable of scraping: + * Frontiers + * ArXiv + * Springer + * Nature + * IEEE + * PubMed + * PMC + * SSRN + * HeinOnline + * MDPI + * ACM + * Project Muse + * Proquest + * JSTOR + * Google Scholar + """ + if url == 'request_input': url = input('URL: ') df = scrape_article(url) - self.activity_log.add_activity(type='web scraping', activity='scraped URL and added to results', location=['results'], url=url) + self.activity_log.add_activity(type='web scraping', activity=f'scraped {url} and added to results', location=['results'], url=url) self.results.add_dataframe(df) # type: ignore def scrape_doi(self, doi = 'request_input'): + """ + Scrapes article data from a given DOI and adds to Results. + + Parameters + ---------- + doi : str + DOI of article to scrape. Defaults to requesting from user input. + + Notes + ----- + This function is capable of scraping: + * Frontiers + * ArXiv + * Springer + * Nature + * IEEE + * PubMed + * PMC + * SSRN + * HeinOnline + * MDPI + * ACM + * Project Muse + * Proquest + * JSTOR + * Google Scholar + """ + if doi == 'request_input': doi = input('doi or URL: ') df = scrape_doi(doi) url = f'https://doi.org/{doi}' - self.activity_log.add_activity(type='web scraping', activity='scraped DOI and added to results', location=['results'], url=url) + self.activity_log.add_activity(type='web scraping', activity=f'scraped {url} and added to results', location=['results'], url=url) self.results.add_dataframe(df) # type: ignore def scrape_google_scholar(self, url = 'request_input'): + """ + Scrapes article data from a given Google Scholar page and adds to Results. + + Parameters + ---------- + url : str + url of Google Scholar page to scrape. Defaults to requesting from user input. + """ + if url == 'request_input': url = input('URL: ') @@ -1576,6 +2575,15 @@ def scrape_google_scholar(self, url = 'request_input'): def scrape_google_scholar_search(self, url = 'request_input'): + """ + Scrapes article data from a given Google Scholar search and adds to Results. + + Parameters + ---------- + url : str + url of Google Scholar search to scrape. Defaults to requesting from user input. + """ + if url == 'request_input': url = input('URL: ') @@ -1585,6 +2593,21 @@ def scrape_google_scholar_search(self, url = 'request_input'): def scrape(self, url = 'request_input', add_to_results=True, drop_empty_rows = True, drop_duplicates = False): + """ + Scrapes website data from a given URL. + + Parameters + ---------- + url : str + url to scrape. Defaults to requesting from user input. + add_to_results : bool + whether to add scraped data to Results. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + """ + if url == 'request_input': url = input('URL: ') @@ -1617,11 +2640,61 @@ def search_crossref(self, select: list = None, # type: ignore sample: int = None, # type: ignore limit: int = None, # type: ignore - rate_limit: float = 0.1, + rate_limit: float = 0.05, timeout = 60, add_to_results = False ) -> pd.DataFrame: + """ + Searches CrossRef API and returns the results as a Pandas DataFrame. + + Parameters + ---------- + bibliographic : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_affiliation : str + searches for author affiliations containing string. 
Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. Defaults to None. + source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + add_to_results : bool + whether to add search results to Review. + filter : dict + select : list + + Returns + ------- + df : pandas.DataFrame + results from CrossRef API search. + """ + df = search_works(bibliographic = bibliographic, title = title, author = author, @@ -1704,6 +2777,91 @@ def search_scopus(self, drop_duplicates = False, format=False): + """ + Searches Scopus API and returns the results as a Pandas DataFrame. + + Parameters + ---------- + tile_abs_key_auth : str + a combined search. Searches for titles, abstracts, keywords, and author names. Defaults to None. + all_fields : str + searches all fields. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + year : str + searches for matching publication years. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_identifier : str + searches for Scopus author IDs. Defaults to None. + affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + publisher : str + searches for publisher names containing string. Defaults to None. + funder : str + searches for funder names containing string. Defaults to None. + abstract : str + searches for abstracts containing string. Defaults to None. + keywords : str + searches for matching keywords. Defaults to None. + doctype : str + searches for types of entries containing string. Defaults to None. + doi : str + searches for matching DOIs. Defaults to None. + issn : str + searches for matching ISSNs. Defaults to None. + isbn : str + searches for matching ISBNs. Defaults to None. + pubmed_id : str + searches for matching PubMed IDs (PMIDs). Defaults to None. + source_title : str + searches for source titles (e.g. journals, books) containing string. Defaults to None. + volume : str + searches for journal entries with matching volume numbers. Defaults to None. + page : str + searches for entries with matching page numbers. Defaults to None. + issue : str + searches for journal entries with matching issue numbers. Defaults to None. + language : str + searches for entries by language Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + references : str + searches for entries with citations that contain matching strings. Defaults to None. 
+ default_operator : str + the default Boolean operator to build the search. Defaults to 'AND' + add_to_results : bool + whether to add search results to Review. + drop_duplicates : bool + whether to remove duplicated rows when adding to results. + drop_empty_rows : bool + whether to remove rows which do not contain any data when adding to results. + format : bool + whether to format results, authors, funders, and affiliations data when adding to results. + refresh : bool + view : bool + verbose : bool + download : bool + integrity_fields : None + integrity_action : str + subscriber : bool + + Returns + ------- + df : pandas.DataFrame + results from Scopus API search. + + Options + ------- + Options for default_operator: + * 'AND' + * 'AND NOT' + * 'NOT' + * 'OR' + """ + df = search_scopus(tile_abs_key_auth = tile_abs_key_auth, all_fields = all_fields, title = title, @@ -1866,6 +3024,23 @@ def search_scopus(self, # return df def lookup_doi(self, doi = 'request_input', timeout = 60): + + """ + Looks up DOI using the CrossRef API. + + Parameters + ---------- + doi : str + DOI to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + df : pandas.DataFrame + results from DOI lookup on CrossRef API. + """ + return lookup_doi(doi=doi, timeout=timeout) def lookup_scopus(self, @@ -1878,6 +3053,32 @@ def lookup_scopus(self, drop_empty_rows = False ): + """ + Looks up publication using the Scopus API. + + Parameters + ---------- + uid : str + Scopus ID, DOI, ISBN, ISSN, or Pubmed ID (PMID) to look up. Defaults to requesting from user input. + refresh : bool + whether to refresh the Scopus session. + view : str + sets the amount of detail returned. Defaults to 'META'. + add_to_results : bool + whether to add results to Review. + drop_duplicates : bool + whether to remove duplicated rows when adding to results. + drop_empty_rows : bool + whether to remove rows which do not contain any data when adding to results. + id_type : None + + + Returns + ------- + df : pandas.DataFrame + results from publication lookup on Scopus API. + """ + if uid == 'request_input': uid = input('ID: ') @@ -1900,6 +3101,30 @@ def lookup_scopus(self, def add_doi(self, doi = 'request_input', timeout = 60, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): + """ + Looks up DOI using the CrossRef API and adds to Review's results dataset. + + Parameters + ---------- + doi : str + DOI to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + self : Review + a Review object. 
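+
+        Examples
+        --------
+        An illustrative sketch only; the DOI below is a placeholder, not a real identifier:
+
+        >>> from art import Review
+        >>> review = Review()
+        >>> review.add_doi('10.1000/placeholder-doi', timeout=60)   # looks the DOI up via CrossRef and appends the record to results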
+ """ + old_len = len(self.results) self.results.add_doi(doi=doi, timeout=timeout) # type: ignore new_len = len(self.results) @@ -1936,16 +3161,85 @@ def add_doi(self, doi = 'request_input', timeout = 60, update_formatting: bool = def from_doi(doi: str = 'request_input', timeout = 60, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): # type: ignore + """ + Looks up DOI using the CrossRef API and returns as a Review object. + + Parameters + ---------- + doi : str + DOI to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + review : Review + a Review object. + """ + review = Review() review.add_doi(doi = doi, timeout = timeout, update_formatting = update_formatting, update_entities=update_entities, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) return review - def lookup_dois(self, dois_list: list = [], rate_limit: float = 0.1, timeout = 60): + def lookup_dois(self, dois_list: list = [], rate_limit: float = 0.05, timeout = 60): + + """ + Looks up a list of DOIs using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + dois_list : list + list of DOIs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + + Returns + ------- + result : pandas.DataFrame + result of DOI lookups. + """ + return lookup_dois(dois_list=dois_list, rate_limit=rate_limit, timeout=timeout) - def add_dois(self, dois_list: list = [], rate_limit: float = 0.1, timeout = 60, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): + def add_dois(self, dois_list: list = [], rate_limit: float = 0.05, timeout = 60, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): + """ + Looks up a list of DOIs using the CrossRef API and adds to Review's results dataset. + + Parameters + ---------- + dois_list : list + list of DOIs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + self : Review + a Review object. 
+ """ + old_len = len(self.results) self.results.add_dois(dois_list=dois_list, rate_limit=rate_limit, timeout=timeout) # type: ignore new_len = len(self.results) @@ -1981,7 +3275,33 @@ def add_dois(self, dois_list: list = [], rate_limit: float = 0.1, timeout = 60, return self - def from_dois(dois_list: list = [], rate_limit: float = 0.1, timeout = 60, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): # type: ignore + def from_dois(dois_list: list = [], rate_limit: float = 0.05, timeout = 60, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): # type: ignore + + """ + Looks up a list of DOIs using the CrossRef API and returns as a Review object. + + Parameters + ---------- + dois_list : list + list of DOIs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + review : Review + a Review object. + """ review = Review() review.add_dois(dois_list = dois_list, rate_limit=rate_limit, timeout = timeout, update_formatting = update_formatting, update_entities=update_entities, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -1990,6 +3310,28 @@ def from_dois(dois_list: list = [], rate_limit: float = 0.1, timeout = 60, updat def update_from_dois(self, timeout: int = 60, update_formatting: bool = True, update_entities = False, drop_empty_rows = False, drop_duplicates = False): + """ + Updates results entries that have DOIs associated using the CrossRef API. + + Parameters + ---------- + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_formatting : bool + whether to format author, funder, affiliations, and citations data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + self : Review + a Review object. + """ + has_doi = len(self.results.has('doi')) # type: ignore self.results.update_from_dois(timeout=timeout) # type: ignore @@ -2024,6 +3366,28 @@ def update_from_dois(self, timeout: int = 60, update_formatting: bool = True, up def sync_apis(self, timeout: int = 60, update_entities = False, drop_empty_rows = False, drop_duplicates = False): + """ + Updates data using all APIs: + * CrossRef (for DOI) + * Orcid + + Parameters + ---------- + timeout : int + how long in seconds to wait for results before raising an error. Defaults to 60 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + update_entities : bool + whether to update entity attributes. + + Returns + ------- + self : Review + a Review object. 
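As a rough sketch of the update workflow described above, assuming the Review already holds records and using placeholder DOIs:

```python
from art import Review

review = Review()
review.add_dois(dois_list=['10.1000/example1', '10.1000/example2'])  # placeholder DOIs

# Refresh every result that has a DOI from CrossRef, then sync across all supported APIs
review.update_from_dois(timeout=60)
review.sync_apis(timeout=60, drop_duplicates=True, drop_empty_rows=True)
```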
+ """ + self.update_from_dois(timeout=timeout) self.update_from_orcid() self.format(update_entities=update_entities, drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) @@ -2031,12 +3395,67 @@ def sync_apis(self, timeout: int = 60, update_entities = False, drop_empty_rows return self def lookup_journal(self, issn = 'request_input', timeout = 60): + + """ + Looks up a journal by its ISSN using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + issn : str + ISSN to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + result : pandas.DataFrame + journal records. + """ + return lookup_journal(issn = issn, timeout = timeout) - def lookup_journals(self, issns_list: list = [], rate_limit: float = 0.1, timeout: int = 60): + def lookup_journals(self, issns_list: list = [], rate_limit: float = 0.05, timeout: int = 60): + + """ + Looks up a list of journal ISSNs using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + issns_list : str + list of ISSNs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + result : pandas.DataFrame + journal records. + """ + return lookup_journals(issns_list = issns_list, rate_limit = rate_limit, timeout = timeout) - def search_journals(self, *args, limit: int = None, rate_limit: float = 0.1, timeout = 60): # type: ignore + def search_journals(self, *args, limit: int = None, rate_limit: float = 0.05, timeout = 60): # type: ignore + + """ + Searches CrossRef API for journal records and returns the results as a Pandas DataFrame. + + Parameters + ---------- + *args + search fields. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + df : pandas.DataFrame + results from CrossRef API search. + """ + return search_journals(*args, limit = limit, rate_limit=rate_limit, timeout = timeout) def get_journal_entries(self, @@ -2045,9 +3464,35 @@ def get_journal_entries(self, select: list = None, # type: ignore sample: int = None, # type: ignore limit: int = None, # type: ignore - rate_limit: float = 0.1, + rate_limit: float = 0.05, timeout = 60): + """ + Looks up a journal using the CrossRef API and returns associated entries as a Pandas DataFrame. + + Parameters + ---------- + issn : str + ISSN to look up. Defaults to requesting from user input. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + add_to_results : bool + whether to add results to Review. + filter : dict + select : list + + Returns + ------- + result : pandas.DataFrame + journal entry records. 
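A minimal sketch of the journal lookup and search methods, using placeholder ISSN and query values:

```python
from art import Review

review = Review()

# Retrieve a journal record by ISSN (placeholder ISSN)
journal = review.lookup_journal(issn='1234-5678', timeout=60)

# Free-text journal search, capped at 20 records
journals_df = review.search_journals('machine learning', limit=20)

# Fetch up to 20 entries published in that journal
entries = review.get_journal_entries(issn='1234-5678', limit=20)
```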
+ """ + return get_journal_entries(issn = issn, filter = filter, select = select, sample = sample, limit = limit, rate_limit = rate_limit, timeout = timeout) def search_journal_entries( @@ -2069,10 +3514,62 @@ def search_journal_entries( select: list = None, # type: ignore sample: int = None, # type: ignore limit: int = None, # type: ignore - rate_limit: float = 0.1, + rate_limit: float = 0.05, timeout: int = 60, add_to_results: bool = False) -> pd.DataFrame: - + + """ + Searches for journal entries and articles associated with an ISSN using the CrossRef API. + + Parameters + ---------- + issn : str + ISSN to look up. Defaults to requesting from user input. + bibliographic : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. Defaults to None. + source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + add_to_results : bool + whether to add search results to Review. + filter : dict + select : list + + Returns + ------- + df : pandas.DataFrame + results from CrossRef API search. + """ + df = search_journal_entries(issn = issn, bibliographic = bibliographic, title=title, @@ -2105,12 +3602,69 @@ def search_journal_entries( return df def lookup_funder(self, funder_id = 'request_input', timeout = 60): + + """ + Looks up a funder using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + funder_id : str + CrossRef Funder ID to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + result : pandas.DataFrame + funder records. + """ + return lookup_funder(funder_id = funder_id, timeout = timeout) - def lookup_funders(self, funder_ids: list = [], rate_limit: float = 0.1, timeout = 60): + def lookup_funders(self, funder_ids: list = [], rate_limit: float = 0.05, timeout = 60): + + """ + Looks up a list of funders using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + funder_ids : list + list of CrossRef Funder IDs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + rate_limit : float + time delay in seconds per result. 
Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + + Returns + ------- + result : pandas.DataFrame + funder records. + """ + return lookup_funders(funder_ids=funder_ids, rate_limit=rate_limit, timeout = timeout) - def search_funders(self, *args, limit: int = None, rate_limit: float = 0.1, timeout = 60): # type: ignore + def search_funders(self, *args, limit: int = None, rate_limit: float = 0.05, timeout = 60): # type: ignore + + """ + Searches CrossRef API for funder records and returns the results as a Pandas DataFrame. + + Parameters + ---------- + *args + search fields. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + df : pandas.DataFrame + results from CrossRef API search. + """ + return search_funders(*args, limit=limit, rate_limit=rate_limit, timeout=timeout) def get_funder_works(self, @@ -2119,10 +3673,36 @@ def get_funder_works(self, select: list = None, # type: ignore sample: int = None, # type: ignore limit: int = None, # type: ignore - rate_limit: float = 0.1, + rate_limit: float = 0.05, timeout: int = 60, add_to_results: bool = False): + """ + Looks up a funder using the CrossRef API and returns associated publications as a Pandas DataFrame. + + Parameters + ---------- + funder_id : str + CrossRef Funder ID to look up. Defaults to requesting from user input. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + add_to_results : bool + whether to add results to Review. + filter : dict + select : list + + Returns + ------- + result : pandas.DataFrame + publication records. + """ + df = get_funder_works(funder_id=funder_id, filter=filter, select=select, sample=sample, limit=limit, rate_limit=rate_limit, timeout=timeout) if add_to_results == True: @@ -2149,10 +3729,62 @@ def search_funder_works(self, select: list = None, # type: ignore sample: int = None, # type: ignore limit: int = None, # type: ignore - rate_limit: float = 0.1, + rate_limit: float = 0.05, timeout: int = 60, add_to_results: bool = False): - + + """ + Searches for publications associated with a funder using the CrossRef API. + + Parameters + ---------- + funder_id : str + CrossRef Funder ID to look up. Defaults to requesting from user input. + bibliographic : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. 
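A hedged sketch of the funder methods documented above; the Funder ID and query below are placeholders:

```python
from art import Review

review = Review()

# Look up a funder by its CrossRef Funder ID (placeholder ID)
funder = review.lookup_funder(funder_id='100000000', timeout=60)

# Keyword search across funder records
funders_df = review.search_funders('national research council', limit=20)

# Pull works associated with the funder and add them to the Review's results
works = review.get_funder_works(funder_id='100000000', limit=50, add_to_results=True)
```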
+ publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. Defaults to None. + source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + add_to_results : bool + whether to add search results to Review. + filter : dict + select : list + + Returns + ------- + df : pandas.DataFrame + results from CrossRef API search. + """ + df = search_funder_works( funder_id=funder_id, bibliographic=bibliographic, @@ -2184,6 +3816,22 @@ def search_funder_works(self, def search_orcid(self, query: str = 'request_input', add_to_authors: bool = True): + """ + Searches for author records using the Orcid API. + + Parameters + ---------- + query : str + query to search. Allows for keywords and Boolean logic. + add_to_authors : bool + whether to add results to Review's authors dataset. + + Returns + ------- + result : pandas.DataFrame + search result. + """ + if add_to_authors == True: self.activity_log.add_activity(type='API search', activity='searched ORCID for author and added to authors', location=['authors'], query=query) @@ -2224,6 +3872,87 @@ def api_search(self, wos = False, add_to_results = False): + """ + Searches multiple APIs and returns the results as a Pandas DataFrame. API options: + * CrossRef + * Scopus + * Web of Science (WoS) + + Parameters + ---------- + default_query : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + all_fields : str + Scopus only: searches all fields. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + year : str + searches for matching publication years. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_identifier : str + searches for API-specific author IDs (e.g. CrossRef, Scopus, WoS, Orcid). Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + publisher : str + searches for publisher names containing string. Defaults to None. + funder : str + searches for funder names containing string. Defaults to None. + abstract : str + searches for abstracts containing string. Defaults to None. + keywords : str + searches for matching keywords. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + isbn : str + searches for matching ISBNs. Defaults to None. + pubmed_id : str + searches for matching PubMed IDs (PMIDs). Defaults to None. + source_title : str + searches for sources with titles (e.g. journals, books) containing string. Defaults to None. + volume : str + searches for journal entries with matching volume numbers. Defaults to None. + page : str + searches for entries with matching page numbers. Defaults to None. 
+ issue : str + searches for journal entries with matching issue numbers. Defaults to None. + language : str + searches for entries by language Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + references : str + searches for entries with citations that contain matching strings. Defaults to None. + topics : str + searches for entries tagged with matching topic names and keywords. Defaults to None. + default_operator : str + the default Boolean operator to build searches. Defaults to 'AND'. + limit_per_api : int + sets limits for the number of results to return per API. Used to limit impact on API servers. Defaults to 20. + rate_limit : float + CrossRef only: time delay in seconds per result. Used to limit impact on API servers. Defaults to 0.05 seconds. + timeout : int + CrossRef only: maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + crossref : bool + whether to search using the CrossRef API. + scopus : bool + whether to search using the Scopus API. + wos : bool + whether to search using the Web of Science (WoS) API. + add_to_results : bool + whether to add search results to Review. + + Returns + ------- + df : pandas.DataFrame + combined results from API searches. + """ + df = api_search(default_query = default_query, all_fields = all_fields, title = title, @@ -2282,6 +4011,29 @@ def api_search(self, def crawl_stored_citations(self, max_depth=3, processing_limit=1000, format = True, update_from_doi = False): + """ + Crawls outward from results' citations to identify new results *only using data already stored in the Review*. + + Parameters + ---------- + max_depth : int + the maximum crawl depth the crawler will reach before stopping. + Defaults to 3. + processing_limit : int + the maximum number of results the crawler will process before stopping. Defaults to 1000. + format : bool + whether to format results, authors, funders, and affiliations data. Defaults to True. + update_from_doi : bool + whether to use the CrossRef API to update entries that have DOIs associated. + + Notes + ----- + Operational details: + * crawl type: utilises a breadth-first crawl. + * crawl depth: the number of iterations the crawler performs. For each iteration, all results from the previous iteration are loaded as seeds to crawl from. + * operation: for each iteration, the crawler takes all citations in the current dataset and -- if they have not been crawled already -- adds any citations data they contain to the results. + """ + iteration = 1 processed_indexes = [] original_len = len(self.results) @@ -2353,8 +4105,8 @@ def crawl_stored_citations(self, max_depth=3, processing_limit=1000, format = Tr def crawl_citations( self, use_api: bool = True, - crawl_limit: int = 5, - depth_limit: int = 2, + crawl_limit: int = 1000, + depth_limit: int = 3, be_polite: bool = True, rate_limit: float = 0.05, timeout: int = 60, @@ -2364,43 +4116,45 @@ def crawl_citations( ): """ - Crawls a Result's object's entries, their citations, and so on. + Crawls all Results entries' citations to find new results. Returns a Pandas DataFrame. The crawler iterates through queue of works; extracts their citations; runs checks to validate each reference; based on these, selects a source to retrieve data from: - (a) if has a valid doi: Crossref API. - (b) if no valid doi: bespoke web scraping for specific academic websites. - (c) else if a link is present: general web scraping. 
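To make the search-then-crawl workflow concrete, a minimal sketch that searches CrossRef only and then expands the dataset from citations already stored in the Review; the query is a placeholder:

```python
from art import Review

review = Review()

# Search CrossRef only and add up to 20 hits to the Review's results
review.api_search(default_query='digital humanities',
                  crossref=True, scopus=False, wos=False,
                  limit_per_api=20, add_to_results=True)

# Breadth-first expansion using citation data already held in the results
review.crawl_stored_citations(max_depth=2, processing_limit=500, format=True)
```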
- - Retrieves data and adds the entries to the dataframe. - - Iterates through each set of added entries. + * if has a valid doi: Crossref API. + * if no valid doi: bespoke web scraping for specific academic websites. + * else if a link is present: general web scraping. Parameters - ---------- - - - + ---------- + use_api : bool + whether to lookup entries and update their data using APIs. Required for the crawler to find new and add new data. Defaults to True. + depth_limit : int + the maximum crawl depth the crawler will reach before stopping. Defaults to 3. + crawl_limit : int + the maximum number of results the crawler will process before stopping. Defaults to 1000. + be_polite : bool + whether to respect websites' crawler permissions, as set out by their robots.txt files. + rate_limit : float + time delay in seconds per result. Used to limit impact on API servers. Defaults to 0.05 seconds. + timeout : int + how long in seconds to wait for results before raising an error. Defaults to 60 seconds. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + Returns ------- - result : object - an object containing the results of a crawl. + result : pandas.DataFrame + the crawl results. """ - result = self.results.crawl_citations( - use_api = use_api, - crawl_limit = crawl_limit, - depth_limit = depth_limit, - be_polite = be_polite, - rate_limit = rate_limit, - timeout = timeout, - add_to_results = add_to_results - ) # type: ignore + data = self.results self.format_citations() result = citation_crawler( - data = self, # type: ignore + data = data, # type: ignore use_api = use_api, crawl_limit = crawl_limit, depth_limit = depth_limit, @@ -2419,28 +4173,38 @@ def crawl_citations( self.results.add_dataframe(df) # type: ignore self.format(drop_duplicates=drop_duplicates, drop_empty_rows=drop_empty_rows) - return result - def citations_dict(self, strip_ids = False): + def citations_dict(self) -> dict: + """ + Returns a dictionary containing Results entries and their citations. + * Keys: work_id + * Values: References object containing citations + """ + output = {} for i in self.results.index: data = self.results.loc[i] work_id = data['work_id'] - work_id_stripped = work_id.split('#')[0].strip() citations = data['citations'] if type(citations) == References: citations.update_work_ids() - output[work_id_stripped] = citations + output[work_id] = citations return output def author_works_dict(self) -> dict: + """ + Returns a dictionary containing Results entries and their associated authors. + * Keys: work_id + * Values: authors data as a list or dictionary + """ + output = {} for i in self.results.index: @@ -2458,6 +4222,12 @@ def author_works_dict(self) -> dict: def author_affiliations_dict(self) -> dict: + """ + Returns a dictionary containing Author entries and their associated affiliations. + * Keys: author_id + * Values: affiliations data as a list or dictionary + """ + output = {} auths = self.authors.summary.copy(deep=True) @@ -2477,6 +4247,12 @@ def author_affiliations_dict(self) -> dict: def funder_works_dict(self) -> dict: + """ + Returns a dictionary containing Results entries and their associated funders. + * Keys: work_id + * Values: funders data as a list or dictionary + """ + output = {} for i in self.results.index: @@ -2501,6 +4277,30 @@ def coauthors_network(self, add_to_networks: bool = True ) -> Network: + """ + Generates a network representing co-authorship relationships. 
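A minimal sketch of a citation crawl seeded from a single placeholder DOI, using only parameters that appear in the signature above:

```python
from art import Review

review = Review()
review.add_doi(doi='10.1000/example.doi')  # placeholder seed record

# Breadth-first citation crawl: CrossRef lookups where a valid DOI exists, scraping otherwise
crawl_df = review.crawl_citations(use_api=True,
                                  depth_limit=2,
                                  crawl_limit=100,
                                  be_polite=True,
                                  rate_limit=0.05,
                                  timeout=60)
```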
+ + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + ignore_case : bool + whether to ignore the case of string data. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing co-authorship relationships. + """ + if drop_empty_rows == True: self.authors.drop_empty_rows() @@ -2560,12 +4360,32 @@ def coauthors_network(self, def cofunders_network(self, format: bool = True, update_attrs: bool = True, - drop_duplicates = False, - drop_empty_rows = True, + # drop_duplicates = False, + # drop_empty_rows = True, ignore_case: bool = True, add_to_networks: bool = True ) -> Network: + """ + Generates a network representing co-funder relationships. + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update funder attributes. + ignore_case : bool + whether to ignore the case of string data. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing co-funder relationships. + """ + co_funders = self.get_cofunders(format=format, update_attrs=update_attrs, ignore_case=ignore_case) g = generate_funders_network(funders_dict=co_funders) @@ -2620,6 +4440,30 @@ def citation_network(self, add_to_networks: bool = True ) -> Network: + """ + Generates a network representing citations between publications. + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author, funder, and affiliations attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + ignore_case : bool + whether to ignore the case of string data. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing citations. + """ + if drop_empty_rows == True: self.results.drop_empty_rows() # type: ignore @@ -2668,6 +4512,34 @@ def cocitation_network(self, add_citations_to_results=True, add_to_networks: bool = True): + """ + Generates a network representing instances of co-citations between publications. + + Parameters + ---------- + refresh_citations : bool + whether to re-generate the underlying citations network. Defaults to False. + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author, funder, and affiliations attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + ignore_case : bool + whether to ignore the case of string data. + add_citations_to_results : bool + whether to add Results entries' citations as Results entries. Defaults to True. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing co-citation relationships. 
+ """ + if refresh_citations == True: citation_network = self.citation_network(format=format, @@ -2697,6 +4569,8 @@ def cocitation_network(self, if add_to_networks == True: self.activity_log.add_activity(type='network generation', activity=f'generated co-citations network and added to networks', location=['networks']) self.networks.__dict__['cocitations'] = network + + return network def bibcoupling_network(self, refresh_citations = False, @@ -2707,6 +4581,34 @@ def bibcoupling_network(self, add_citations_to_results=True, add_to_networks: bool = True): + """ + Generates a network representing bibliometric coupling between publications. + + Parameters + ---------- + refresh_citations : bool + whether to re-generate the underlying citations network. Defaults to False. + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author, funder, and affiliations attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + ignore_case : bool + whether to ignore the case of string data. + add_citations_to_results : bool + whether to add Results entries' citations as Results entries. Defaults to True. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing bibliometric coupling. + """ + if refresh_citations == True: citation_network = self.citation_network(format=format, @@ -2736,6 +4638,8 @@ def bibcoupling_network(self, self.activity_log.add_activity(type='network generation', activity=f'generated bibliometric coupling network and added to networks', location=['networks']) self.networks.__dict__['bibcoupling'] = network + return network + def author_works_network(self, format: bool = True, update_attrs: bool = True, @@ -2744,6 +4648,28 @@ def author_works_network(self, add_to_networks: bool = True ) -> Network: + """ + Generates a bipartite network representing relationships between authors and publications. + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author, funder, and affiliations attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing relationships between authors and publications. + """ + if drop_empty_rows == True: self.results.drop_empty_rows() # type: ignore self.authors.drop_empty_rows() # type: ignore @@ -2777,6 +4703,28 @@ def funder_works_network(self, add_to_networks: bool = True ) -> Network: + """ + Generates a bipartite network representing relationships between funders and publications. + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author, funder, and affiliations attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing relationships between funders and publications. 
+ """ + if drop_empty_rows == True: self.results.drop_empty_rows() # type: ignore @@ -2809,6 +4757,28 @@ def author_affils_network(self, add_to_networks: bool = True ) -> Network: + """ + Generates a bipartite network representing relationships between authors and affiliate organisations. + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author, funder, and affiliations attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing relationships between authors and and affiliate organisations. + """ + if drop_empty_rows == True: self.authors.drop_empty_rows() @@ -2839,6 +4809,28 @@ def entities_network(self, add_to_networks: bool = True ) -> Network: + """ + Generates an n-partite network representing relationships between publications, authors, funders, and affiliate organisations. + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author, funder, and affiliations attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + network : Network + a network representing relationships between publications, authors, funders, and affiliate organisations. + """ + if drop_empty_rows == True: self.results.drop_empty_rows() # type: ignore self.authors.drop_empty_rows() # type: ignore @@ -2887,6 +4879,43 @@ def all_networks(self, add_to_networks: bool = True ) -> Networks: + """ + Generates all available networks: + * Citations + * Co-citations + * Bibliometric coupling + * Co-authors + * Co-funders + * Author-works (bipartite) + * Funder-works (bipartite) + * Author-affiliations (bipartite) + * Works, authors, funders and affiliations (n-partite) + + + Parameters + ---------- + format : bool + whether to format results, authors, funders, and affiliations data. + update_attrs : bool + whether to update author, funder, and affiliations attributes. + drop_duplicates : bool + whether to remove duplicated rows. + drop_empty_rows : bool + whether to remove rows which do not contain any data. + ignore_case : bool + whether to ignore the case of string data. + add_citations_to_results : bool + whether to add Results entries' citations as Results entries. Defaults to True. + add_to_networks : bool + whether to store the network in the Review's Networks attribute. + + Returns + ------- + networks : Networks + a Networks object containing all available networks. 
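A short sketch of how the network generators might be used once a Review holds results; the seed DOIs are placeholders:

```python
from art import Review

review = Review()
review.add_dois(dois_list=['10.1000/example1', '10.1000/example2'])  # placeholder seeds

# Individual networks, stored on review.networks when add_to_networks is True
coauthors = review.coauthors_network(add_to_networks=True)
citations = review.citation_network(format=True, add_to_networks=True)
cocitations = review.cocitation_network(refresh_citations=False)

# Or build every available network in one call
networks = review.all_networks(add_to_networks=True)
```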
+ """ + + if drop_empty_rows == True: self.results.drop_empty_rows() # type: ignore self.authors.drop_empty_rows() # type: ignore diff --git a/art/datasets/__pycache__/__init__.cpython-39.pyc b/art/datasets/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..97c2863 Binary files /dev/null and b/art/datasets/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/datasets/stopwords/__pycache__/stopwords.cpython-39.pyc b/art/datasets/stopwords/__pycache__/stopwords.cpython-39.pyc new file mode 100644 index 0000000..3757797 Binary files /dev/null and b/art/datasets/stopwords/__pycache__/stopwords.cpython-39.pyc differ diff --git a/art/exporters/__pycache__/__init__.cpython-39.pyc b/art/exporters/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..eed831f Binary files /dev/null and b/art/exporters/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/exporters/__pycache__/general_exporters.cpython-39.pyc b/art/exporters/__pycache__/general_exporters.cpython-39.pyc new file mode 100644 index 0000000..64717d6 Binary files /dev/null and b/art/exporters/__pycache__/general_exporters.cpython-39.pyc differ diff --git a/art/exporters/__pycache__/network_exporters.cpython-39.pyc b/art/exporters/__pycache__/network_exporters.cpython-39.pyc new file mode 100644 index 0000000..37a233e Binary files /dev/null and b/art/exporters/__pycache__/network_exporters.cpython-39.pyc differ diff --git a/art/exporters/general_exporters.py b/art/exporters/general_exporters.py index 41dc6c7..481a38a 100644 --- a/art/exporters/general_exporters.py +++ b/art/exporters/general_exporters.py @@ -15,7 +15,24 @@ def export_obj(obj, file_name = 'obj_name', folder_address: str = 'request_input', export_str_as: str = 'txt', export_dict_as: str = 'json', export_pandas_as: str = 'csv', export_network_as: str = 'graphML'): """ - Exports objects to external files based on their type. + Exports objects to external files based on their type. Detects the object's type and selects a corresponding file type. + + Parameters + ---------- + obj : object + the object to export. + file_name : str + name for export file. Defaults to requesting from user input. + folder_address : str + directory path for folder to export to. Defaults to requesting from user input. + export_str_as : str + file type for saving string objects. Defaults to 'txt', i.e. a .txt file. + export_dict_as : str + file type for saving dictionary objects. Defaults to 'json', i.e. a JSON file. + export_pandas_as : str + file type for saving Pandas objects (e.g. Series and DataFrames). Defaults to 'csv', i.e. a CSV file. + export_network_as : str + file type for saving network and graph objects (e.g. Network, iGraph Graph, NetworkX). Defaults to 'graphML'. """ # Checking object type @@ -113,6 +130,27 @@ def export_obj(obj, file_name = 'obj_name', folder_address: str = 'request_input def art_class_to_folder(obj, folder_name = 'request_input', folder_address: str = 'request_input', export_str_as: str = 'txt', export_dict_as: str = 'json', export_pandas_as: str = 'csv', export_network_as: str = 'graphML'): + """ + Specialised function to export ART classes to external folders. Detects the object's type and selects a corresponding file type. + + Parameters + ---------- + obj : object + the object to export. + folder_name : str + name for export folder. Defaults to requesting from user input. + folder_address : str + directory path for folder to export to. Defaults to requesting from user input. 
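A minimal sketch of export_obj, assuming the target folder already exists; the folder path and file name are placeholders:

```python
import pandas as pd
from art.exporters.general_exporters import export_obj

df = pd.DataFrame({'title': ['Example paper'], 'doi': ['10.1000/example.doi']})

# The exporter inspects the object's type and picks a matching file format,
# so this DataFrame would be written out as a CSV file in the given folder
export_obj(df,
           file_name='review_results',
           folder_address='/tmp/art_exports',
           export_pandas_as='csv')
```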
+ export_str_as : str + file type for saving string objects. Defaults to 'txt', i.e. a .txt file. + export_dict_as : str + file type for saving dictionary objects. Defaults to 'json', i.e. a JSON file. + export_pandas_as : str + file type for saving Pandas objects (e.g. Series and DataFrames). Defaults to 'csv', i.e. a CSV file. + export_network_as : str + file type for saving network and graph objects (e.g. Network, iGraph Graph, NetworkX). Defaults to 'graphML'. + """ + obj_type_str = str(type(obj)) # If the object is None, no folder created @@ -231,6 +269,23 @@ def obj_to_folder(obj, folder_name = 'request_input', folder_address: str = 'req """ Exports objects as external folders. + + Parameters + ---------- + obj : object + the object to export. + folder_name : str + name for export folder. Defaults to requesting from user input. + folder_address : str + directory path for folder to export to. Defaults to requesting from user input. + export_str_as : str + file type for saving string objects. Defaults to 'txt', i.e. a .txt file. + export_dict_as : str + file type for saving dictionary objects. Defaults to 'json', i.e. a JSON file. + export_pandas_as : str + file type for saving Pandas objects (e.g. Series and DataFrames). Defaults to 'csv', i.e. a CSV file. + export_network_as : str + file type for saving network and graph objects (e.g. Network, iGraph Graph, NetworkX). Defaults to 'graphML'. """ obj_type = type(obj) diff --git a/art/exporters/network_exporters.py b/art/exporters/network_exporters.py index 64e6b3c..a035f5f 100644 --- a/art/exporters/network_exporters.py +++ b/art/exporters/network_exporters.py @@ -2,7 +2,6 @@ from typing import List, Dict, Tuple import json -import copy import igraph as ig # type: ignore from igraph import Graph # type: ignore from networkx.classes import Graph as NetworkX_Undir, DiGraph as NetworkX_Dir, MultiGraph as NetworkX_Multi # type: ignore @@ -107,6 +106,8 @@ def export_network(network: Graph, file_name: str = 'request_input', folder_addr ): network = Graph.from_networkx(network) + file_type = file_type.strip('.') + # Writing GraphML file by default or if selected if (file_type == None) or (file_type == '') or (file_type.lower() == 'graphml'): diff --git a/art/importers/__pycache__/__init__.cpython-39.pyc b/art/importers/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..9d830fa Binary files /dev/null and b/art/importers/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/importers/__pycache__/bibtex.cpython-39.pyc b/art/importers/__pycache__/bibtex.cpython-39.pyc new file mode 100644 index 0000000..29bdcb8 Binary files /dev/null and b/art/importers/__pycache__/bibtex.cpython-39.pyc differ diff --git a/art/importers/__pycache__/crossref.cpython-39.pyc b/art/importers/__pycache__/crossref.cpython-39.pyc new file mode 100644 index 0000000..a40bec6 Binary files /dev/null and b/art/importers/__pycache__/crossref.cpython-39.pyc differ diff --git a/art/importers/__pycache__/jstor.cpython-39.pyc b/art/importers/__pycache__/jstor.cpython-39.pyc new file mode 100644 index 0000000..1df95a1 Binary files /dev/null and b/art/importers/__pycache__/jstor.cpython-39.pyc differ diff --git a/art/importers/__pycache__/orcid.cpython-39.pyc b/art/importers/__pycache__/orcid.cpython-39.pyc new file mode 100644 index 0000000..a92c7ad Binary files /dev/null and b/art/importers/__pycache__/orcid.cpython-39.pyc differ diff --git a/art/importers/__pycache__/pdf.cpython-39.pyc b/art/importers/__pycache__/pdf.cpython-39.pyc new file mode 100644 
index 0000000..eef2ca5 Binary files /dev/null and b/art/importers/__pycache__/pdf.cpython-39.pyc differ diff --git a/art/importers/__pycache__/scopus.cpython-39.pyc b/art/importers/__pycache__/scopus.cpython-39.pyc new file mode 100644 index 0000000..5930b7d Binary files /dev/null and b/art/importers/__pycache__/scopus.cpython-39.pyc differ diff --git a/art/importers/__pycache__/search.cpython-39.pyc b/art/importers/__pycache__/search.cpython-39.pyc new file mode 100644 index 0000000..e863191 Binary files /dev/null and b/art/importers/__pycache__/search.cpython-39.pyc differ diff --git a/art/importers/bibtex.py b/art/importers/bibtex.py index e4b792a..71a4aaa 100644 --- a/art/importers/bibtex.py +++ b/art/importers/bibtex.py @@ -6,6 +6,20 @@ def import_bibtex(file_path = 'request_input'): + """ + Reads Bibtex bibliography (.bib) file and returns as a Pandas DataFrame. + + Parameters + ---------- + file_path : str + directory address for .bib file to read. Defaults to requesting from user input. + + Returns + ------- + df : pandas.DataFrame + a Pandas DataFrame of the bibliographic data contained in the Bibtex file. + """ + if file_path == 'request_input': file_path = input('File path: ') diff --git a/art/importers/crossref.py b/art/importers/crossref.py index f8a59b3..b1b8cc6 100644 --- a/art/importers/crossref.py +++ b/art/importers/crossref.py @@ -90,10 +90,14 @@ 'updates' ] -my_etiquette = Etiquette('Academic Review Tool (ART)', '0.01', 'https://github.com/alan-turing-institute/academic_review_tool', 'academic_review_tool@outlook.com') +my_etiquette = Etiquette('Academic Review Tool (ART)', '1.10-beta', 'https://github.com/alan-turing-institute/academic_review_tool', 'academic_review_tool@outlook.com') def items_to_df(items: list) -> pd.DataFrame: + """ + Takes list containing items from CrossRef API call and returns as a Pandas DataFrame. + """ + global results_cols df = pd.DataFrame(columns = results_cols) @@ -221,6 +225,22 @@ def items_to_df(items: list) -> pd.DataFrame: def reference_to_df(reference: dict, update_from_doi = False) -> pd.DataFrame: + """ + Takes reference (i.e. citation) dictionary from CrossRef API result and returns as a Pandas DataFrame. + + Parameters + ---------- + reference : dict + a dictionary containing data on a reference associated with a CrossRef API result. + update_from_doi : bool + whether to update the reference data using the CrossRef API. Defaults to False. + + Returns + ------- + df : pandas.DataFrame + the reference formatted as a Pandas DataFrame. + """ + keys = list(reference.keys()) df_data = {} @@ -319,9 +339,24 @@ def reference_to_df(reference: dict, update_from_doi = False) -> pd.DataFrame: return df - def references_to_df(references_list: list, update_from_doi = False) -> pd.DataFrame: + """ + Takes a list of references (i.e. citations) from a CrossRef API result and returns as a Pandas DataFrame. + + Parameters + ---------- + references : list + a list containing data on references associated with a CrossRef API result. + update_from_doi : bool + whether to update the reference data using the CrossRef API. Defaults to False. + + Returns + ------- + df : pandas.DataFrame + the reference formatted as a Pandas DataFrame. 
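A minimal sketch of the BibTeX importer; the file path is a placeholder:

```python
from art.importers.bibtex import import_bibtex

# Read a BibTeX bibliography (.bib) into a Pandas DataFrame
bib_df = import_bibtex(file_path='references.bib')
print(bib_df.head())
```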
+ """ + df = pd.DataFrame(columns = results_cols, dtype=object) for i in references_list: @@ -338,9 +373,12 @@ def references_to_df(references_list: list, update_from_doi = False) -> pd.DataF return df - def operator_logic(default_operator: str, string: str): + """ + Takes CrossRef API search string, detects the logical operator used, and separates the operator and string. Returns a tuple. + """ + operator = default_operator if string.startswith('AND '): @@ -363,7 +401,6 @@ def operator_logic(default_operator: str, string: str): return (operator, string_stripped) - def query_builder(default_operator = 'AND', bibliographic: str = None, # type: ignore title: str = None, # type: ignore @@ -380,6 +417,46 @@ def query_builder(default_operator = 'AND', link: str = None, # type: ignore ): + """ + Takes queries for specific search fields and returns a string which is formatted for input into the CrossRef API. + + Parameters + ---------- + default_operator : str + default logical operator to build search. Defaults to 'AND'. + bibliographic : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. Defaults to None. + source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + + Returns + ------- + query : str + a query formatted for input into the CrossRef API. + """ + query = '' if (bibliographic is not None) and (type(bibliographic) == str): # type: ignore @@ -446,7 +523,6 @@ def query_builder(default_operator = 'AND', return query - def search_works( bibliographic = None, # type: ignore title: str = None, # type: ignore @@ -469,6 +545,54 @@ def search_works( timeout = 60 ) -> pd.DataFrame: + """ + Searches CrossRef API for published works. Returns the results as a Pandas DataFrame. + + Parameters + ---------- + bibliographic : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. Defaults to None. 
+ source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + filter : dict + select : list + + Returns + ------- + df : pandas.DataFrame + results from CrossRef API search. + """ + if bibliographic == None: bibliographic = '' @@ -585,6 +709,22 @@ def search_works( def lookup_doi(doi = 'request_input', timeout = 60): + """ + Looks up DOI using the CrossRef API. + + Parameters + ---------- + doi : str + DOI to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + df : pandas.DataFrame + results from DOI lookup on CrossRef API. + """ + if doi == 'request_input': doi = input('doi: ') @@ -601,6 +741,24 @@ def lookup_doi(doi = 'request_input', timeout = 60): def lookup_dois(dois_list: list = [], rate_limit: float = 0.05, timeout = 60): + """ + Looks up a list of DOIs using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + dois_list : list + list of DOIs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + + Returns + ------- + result : pandas.DataFrame + result of DOI lookups. + """ + items = [] global my_etiquette @@ -616,6 +774,22 @@ def lookup_dois(dois_list: list = [], rate_limit: float = 0.05, timeout = 60): def lookup_journal(issn = 'request_input', timeout = 60): + """ + Looks up a journal by its ISSN using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + issn : str + ISSN to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + result : pandas.DataFrame + journal records. + """ + if issn == 'request_input': issn = input('Journal issn: ') @@ -627,6 +801,22 @@ def lookup_journal(issn = 'request_input', timeout = 60): def lookup_journals(issns_list: list = [], rate_limit: float = 0.05, timeout = 60): + """ + Looks up a list of journal ISSNs using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + issns_list : str + list of ISSNs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + output : pandas.DataFrame + journal records. + """ + global my_etiquette journals = Journals(etiquette=my_etiquette, timeout=timeout) @@ -642,6 +832,26 @@ def lookup_journals(issns_list: list = [], rate_limit: float = 0.05, timeout = 6 def search_journals(*args, limit: int = 1000, rate_limit: float = 0.05, timeout = 60): + """ + Searches CrossRef API for journal records and returns the results as a Pandas DataFrame. + + Parameters + ---------- + *args + search fields. 
+ limit : int + optional: set a limit to the number of results returned. Defaults to 1000. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + results : pandas.DataFrame + results from CrossRef API search. + """ + global my_etiquette journals = Journals(etiquette=my_etiquette, timeout=timeout).query(*args) @@ -675,6 +885,30 @@ def get_journal_entries(issn = 'request_input', rate_limit: float = 0.05, timeout = 60): + """ + Looks up a journal using the CrossRef API and returns associated entries as a Pandas DataFrame. + + Parameters + ---------- + issn : str + ISSN to look up. Defaults to requesting from user input. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + filter : dict + select : list + + Returns + ------- + df : pandas.DataFrame + journal entry records. + """ + if issn == 'request_input': issn = input('Journal issn: ') @@ -749,6 +983,55 @@ def search_journal_entries(issn = 'request_input', rate_limit: float = 0.05, timeout = 60): + """ + Searches for journal entries and articles associated with an ISSN using the CrossRef API. + + Parameters + ---------- + issn : str + ISSN to look up. Defaults to requesting from user input. + bibliographic : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. Defaults to None. + source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + filter : dict + select : list + + Returns + ------- + df : pandas.DataFrame + results from CrossRef API search. + """ if issn == 'request_input': issn = input('Journal issn: ') @@ -841,6 +1124,22 @@ def search_journal_entries(issn = 'request_input', def lookup_funder(funder_id = 'request_input', timeout = 60): + """ + Looks up a funder using the CrossRef API. Returns a Pandas DataFrame. 
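The module-level CrossRef helpers can also be called directly, outside a Review. A hedged sketch with placeholder identifiers:

```python
from art.importers.crossref import search_works, lookup_doi, lookup_journal

# Field-specific CrossRef search, capped at 20 records with a polite rate limit
hits = search_works(title='citation analysis', limit=20, rate_limit=0.05)

# Direct lookups (identifier values are placeholders)
record = lookup_doi(doi='10.1000/example.doi', timeout=60)
journal = lookup_journal(issn='1234-5678')
```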
+ + Parameters + ---------- + funder_id : str + CrossRef Funder ID to look up. Defaults to requesting from user input. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + output : pandas.DataFrame + funder records. + """ + if funder_id == 'request_input': funder_id = input('Funder ID: ') @@ -867,6 +1166,24 @@ def lookup_funder(funder_id = 'request_input', timeout = 60): def lookup_funders(funder_ids: list = [], rate_limit: float = 0.05, timeout = 60): + """ + Looks up a list of funders using the CrossRef API. Returns a Pandas DataFrame. + + Parameters + ---------- + funder_ids : list + list of CrossRef Funder IDs to look up. Defaults to an empty list. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + + Returns + ------- + output : pandas.DataFrame + funder records. + """ + global my_etiquette funders = Funders(etiquette=my_etiquette, timeout=timeout) @@ -900,6 +1217,26 @@ def lookup_funders(funder_ids: list = [], rate_limit: float = 0.05, timeout = 60 def search_funders(*args, limit: int = 1000, rate_limit: float = 0.05, timeout = 60): + """ + Searches CrossRef API for funder records and returns the results as a Pandas DataFrame. + + Parameters + ---------- + *args + search fields. + limit : int + optional: set a limit to the number of results returned. Defaults to 1000. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + + Returns + ------- + output : pandas.DataFrame + results from CrossRef API search. + """ + global my_etiquette funders = Funders(etiquette=my_etiquette, timeout=timeout).query(*args) @@ -948,6 +1285,30 @@ def get_funder_works(funder_id = 'request_input', rate_limit: float = 0.05, timeout = 60): + """ + Looks up a funder using the CrossRef API and returns associated publications as a Pandas DataFrame. + + Parameters + ---------- + funder_id : str + CrossRef Funder ID to look up. Defaults to requesting from user input. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + filter : dict + select : list + + Returns + ------- + df : pandas.DataFrame + publication records. + """ + if funder_id == 'request_input': funder_id = input('Funder ID: ') @@ -1025,6 +1386,55 @@ def search_funder_works(funder_id = 'request_input', rate_limit: float = 0.05, timeout = 60): + """ + Searches for publications associated with a funder using the CrossRef API. + + Parameters + ---------- + funder_id : str + CrossRef Funder ID to look up. Defaults to requesting from user input. + bibliographic : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + author : str + searches for authors containing string. Defaults to None. 
+ author_affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + published_date : str + searches for matching publication dates. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + publisher_name : str + searches for publisher names containing string. Defaults to None. + funder_name : str + searches for funder names containing string. Defaults to None. + source : str + searches for sources (e.g. journals, books) containing string. Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + sample : int + optional: select which results to return. + limit : int + optional: set a limit to the number of results returned. + rate_limit : float + time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds. + timeout : int + maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + filter : dict + select : list + + Returns + ------- + df : pandas.DataFrame + results from CrossRef API search. + """ if funder_id == 'request_input': funder_id = input('Funder ID: ') diff --git a/art/importers/jstor.py b/art/importers/jstor.py index c8c6c6f..6085401 100644 --- a/art/importers/jstor.py +++ b/art/importers/jstor.py @@ -9,12 +9,31 @@ import numpy as np def access_jstor_database(): + + """ + Opens the JSTOR Constellate website in the default web browser. + """ + return webbrowser.open('https://constellate.org/builder') def import_metadata(file_path = 'request_input', # clean_results = True ): + """ + Reads a metadata CSV file outputted by JSTOR's Constellate portal and returns a Pandas DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + + Returns + ------- + output_df : pandas.DataFrame + a Pandas DataFrame containing JSTOR metadata. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -40,8 +59,6 @@ def import_metadata(file_path = 'request_input', if c not in results_cols: to_drop.append(c) - - output_df = import_df.drop(labels=to_drop, axis='columns') output_df['authors'] = output_df['authors'].str.lower().str.split(';') output_df['authors_data'] = output_df['authors'].copy(deep=True) @@ -69,6 +86,20 @@ def import_full(file_path = 'request_input', # clean_results = True ): + """ + Reads a JSON file outputted by JSTOR's Constellate portal and returns a Pandas DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + + Returns + ------- + output_df : pandas.DataFrame + a Pandas DataFrame containing JSTOR data. + """ + if file_path == 'request_input': file_path = input('File path: ') @@ -120,6 +151,26 @@ def import_full(file_path = 'request_input', def import_jstor(file_path = 'request_input') -> pd.DataFrame: + """ + Reads a file outputted by JSTOR's Constellate portal and returns as a Pandas DataFrame. + + Parameters + ---------- + file_path : str + directory path of file to import. Defaults to requesting from user input. + + Returns + ------- + result : pandas.DataFrame. + a Pandas DataFrame containing JSTOR data. 
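A minimal sketch of how the JSTOR Constellate importers documented in this file might be used; the file paths are placeholders:

```python
# Hypothetical usage sketch for the JSTOR Constellate importers documented above.
from art.importers.jstor import import_metadata, import_full, import_jstor

# Import a metadata CSV exported from Constellate (placeholder path).
metadata_df = import_metadata(file_path='downloads/constellate_metadata.csv')

# Import a full-text JSON export (placeholder path).
full_df = import_full(file_path='downloads/constellate_full.json')

# Or let import_jstor choose the reader based on the file extension (.csv or .json).
combined_df = import_jstor(file_path='downloads/constellate_metadata.csv')
```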
+ + Notes + ----- + Can read: + * .csv + * .json + """ + if file_path == 'request_input': file_path = input('File path: ') diff --git a/art/importers/orcid.py b/art/importers/orcid.py index ce1099f..4ddd851 100644 --- a/art/importers/orcid.py +++ b/art/importers/orcid.py @@ -23,6 +23,20 @@ def lookup_orcid(orcid_id = 'request_input'): + """ + Looks up an ORCID ID and returns a Pandas DataFrame of potential matches. + + Parameters + ---------- + orcid_id : str + an ORCID ID to look up. Defaults to requesting from user input. + + Results + ------- + df : pandas.DataFrame + a Pandas DataFrame of potential matches in the ORCID database. + """ + if orcid_id == 'request_input': orcid_id = input('ORCID ID: ') @@ -43,6 +57,10 @@ def lookup_orcid(orcid_id = 'request_input'): def get_author(orcid_id = 'request_input'): + """ + Retrieves an ORCID account using an ORCID ID. Returns a Pyorcid Orcid object. + """ + if orcid_id == 'request_input': orcid_id = input('ORCID ID: ') @@ -53,6 +71,22 @@ def get_author(orcid_id = 'request_input'): def get_author_works(orcid_id = 'request_input', output = 'dataframe'): + """ + Retrieves data on an ORCID profile's listed works. + + Parameters + ---------- + orcid_id : str + an ORCID ID to look up. Defaults to requesting from user input. + output : str + the type of object to return. Defaults to 'dataframe' (a Pandas DataFrame). + + Returns + ------- + works_list : pandas.DataFrame or list or tuple + an object containing data on the ORCID profile's listed works. + """ + if orcid_id == 'request_input': orcid_id = input('ORCID ID: ') @@ -74,6 +108,26 @@ def get_author_works(orcid_id = 'request_input', output = 'dataframe'): def search(query: str = 'request_input', start: int = 0, limit: int = 1000, output: str = 'dataframe'): + """ + Searches for author records using the Orcid API. + + Parameters + ---------- + query : str + query to search. Allows for keywords and Boolean logic. + start : int + index position of first result to return. Defaults to 0. + limit : int + the maximum number of results returned. Defaults to 1000. + output : str + the type of object to return. Defaults to 'dataframe' (a Pandas DataFrame). + + Returns + ------- + results_list : pandas.DataFrame or list or tuple + an object containing search results. + """ + if query == 'request_input': query = input('Search query: ') @@ -116,11 +170,6 @@ def search(query: str = 'request_input', start: int = 0, limit: int = 1000, outp if (output == pd.DataFrame) or (output.lower().strip() == 'dataframe'): return pd.DataFrame(results_list) - - - - - def save_summary(self: Orcid, file_name: str = 'request_input', file_path: str = 'request_input'): if file_name == 'request_input': diff --git a/art/importers/pdf.py b/art/importers/pdf.py index 72f7810..4478fab 100644 --- a/art/importers/pdf.py +++ b/art/importers/pdf.py @@ -15,7 +15,6 @@ from pypdf import PdfReader from nltk.tokenize import word_tokenize, sent_tokenize - def pdf_to_dict(file_path = None): """ @@ -52,7 +51,6 @@ def pdf_to_dict(file_path = None): return output_dict - def pdf_url_to_dict(url = None): """ @@ -94,9 +92,12 @@ def pdf_url_to_dict(url = None): return output_dict - def parse_pdf_text(input_data): + """ + Parses text from PDF reader result. + """ + if type(input_data) == str: text = input_data else: @@ -148,9 +149,12 @@ def parse_pdf_text(input_data): return output - def parse_pdf_doi(input_data): + """ + Parses DOI from PDF reader result. 
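For the ORCID helpers documented above, a usage sketch along these lines is consistent with the docstrings; the ORCID iD is the standard example identifier and the query syntax follows the ORCID API's keyword/Boolean conventions:

```python
# Hypothetical usage sketch for the ORCID importer functions documented above.
from art.importers.orcid import lookup_orcid, get_author_works, search

# Look up potential matches for an ORCID iD (example/placeholder value).
matches_df = lookup_orcid(orcid_id='0000-0002-1825-0097')

# Retrieve the works listed on that profile as a DataFrame.
works_df = get_author_works(orcid_id='0000-0002-1825-0097', output='dataframe')

# Keyword search for author records, returning at most 100 results.
authors_df = search(query='family-name:Smith AND keyword:bibliometrics', start=0, limit=100)
```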
+ """ + if type(input_data) == str: text = input_data metadata = {} @@ -196,7 +200,6 @@ def parse_pdf_doi(input_data): return text_split[0] - def parse_pdf_links(input_data): """ @@ -241,11 +244,10 @@ def parse_pdf_links(input_data): return result - def parse_pdf_authors(input_data): """ - Identifies author details from PDF reader result. + Parses author details from PDF reader result. """ # Checking type of input data and defining variables for parsing @@ -413,8 +415,6 @@ def parse_pdf_date(pdf_dict): return date - - def parse_pdf_title(input_data): """ @@ -517,9 +517,12 @@ def parse_pdf_title(input_data): return output - def parse_pdf_reader_dict(pdf_dict): + """ + Parses dictionaries outputted by the PDF reader. Returns a dictionary containing parsed and formatted data. + """ + try: pdf_dict['title'] = parse_pdf_title(pdf_dict) except: @@ -548,14 +551,14 @@ def parse_pdf_reader_dict(pdf_dict): try: pdf_dict['text'] = parse_pdf_text(pdf_dict) except: - pdf_dict['text'] = Nonee + pdf_dict['text'] = None return pdf_dict def read_pdf(file_path: str = None) -> dict: """ - Loads and parses PDF file. Returns a dictionary of data. + Reads and parses PDF file. Returns a dictionary of parsed and formatted data. """ # Retrieving PDF date @@ -567,7 +570,6 @@ def read_pdf(file_path: str = None) -> dict: return output - def read_pdf_url(url = None): """ @@ -585,6 +587,10 @@ def read_pdf_url(url = None): def select_pdf_reader(file_path): + """ + Detects whether a file path directs to a valid internal address or URL, and reads the PDF using the appropriate reader function. + """ + path = Path(file_path) if path.exists() == True: @@ -595,6 +601,10 @@ def select_pdf_reader(file_path): def gen_pdf_id(pdf_dict): + """ + Generates a unique identifier from a PDF reader result. + """ + if (pdf_dict['title'] != None) and (pdf_dict['title'] != ''): pdf_id = pdf_dict['title'].replace(' ', '').replace('/', '').replace('.', '').replace('?', '').replace('-', '').replace('_', '').replace('!', '').lower().strip() rand_int = (random.randint(0,9)*len(pdf_id))+(random.randint(0,9)+len(pdf_id)) @@ -614,10 +624,12 @@ def gen_pdf_id(pdf_dict): return pdf_id - - def read_pdf_to_table(file_path = None): + """ + Reads and parses PDF file. Returns a Pandas DataFrame of parsed and formatted data. + """ + if file_path == None: file_path = input('File path: ') diff --git a/art/importers/scopus.py b/art/importers/scopus.py index 4460c12..3c0e07f 100644 --- a/art/importers/scopus.py +++ b/art/importers/scopus.py @@ -13,9 +13,12 @@ from pybliometrics.scopus import AbstractRetrieval, ScopusSearch # type: ignore - def operator_logic(default_operator: str, string: str): + """ + Takes Scopus API search string, detects the logical operator used, and separates the operator and string. Returns a tuple. + """ + string = string.replace('NOT ', 'AND NOT ').replace('AND AND NOT ', 'AND NOT ') operator = default_operator @@ -67,6 +70,68 @@ def query_builder(default_operator = 'AND', tile_abs_key_auth = None ): + """ + Takes queries for specific search fields and returns a string which is formatted for input into the Scopus API. + + Parameters + ---------- + default_operator : str + default logical operator to build search. Defaults to 'AND'. + tile_abs_key_auth : str + a combined search. Searches for titles, abstracts, keywords, and author names. Defaults to None. + all_fields : str + searches all fields. Defaults to None. + title : str + searches for titles containing string. Defaults to None. 
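The PDF importer functions documented above follow a read-then-parse pattern. A minimal sketch, with placeholder paths and URLs, showing the dictionary and DataFrame entry points (only the 'title' and 'text' keys are named in the parsing code shown here, so other keys are assumptions):

```python
# Hypothetical usage sketch for the PDF importer functions documented above.
from art.importers.pdf import read_pdf, read_pdf_url, read_pdf_to_table

# Parse a local PDF into a dictionary of extracted fields (placeholder path).
record = read_pdf(file_path='papers/example_article.pdf')
print(record.get('title'), (record.get('text') or '')[:200])

# Parse a PDF hosted at a URL (placeholder address).
web_record = read_pdf_url(url='https://example.org/article.pdf')

# Or get a one-row Pandas DataFrame instead of a dictionary.
df = read_pdf_to_table(file_path='papers/example_article.pdf')
```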
+ year : str + searches for matching publication years. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_identifier : str + searches for Scopus author IDs. Defaults to None. + affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + publisher : str + searches for publisher names containing string. Defaults to None. + funder : str + searches for funder names containing string. Defaults to None. + abstract : str + searches for abstracts containing string. Defaults to None. + keywords : str + searches for matching keywords. Defaults to None. + doctype : str + searches for types of entries containing string. Defaults to None. + doi : str + searches for matching DOIs. Defaults to None. + issn : str + searches for matching ISSNs. Defaults to None. + isbn : str + searches for matching ISBNs. Defaults to None. + pubmed_id : str + searches for matching PubMed IDs (PMIDs). Defaults to None. + source_title : str + searches for source titles (e.g. journals, books) containing string. Defaults to None. + volume : str + searches for journal entries with matching volume numbers. Defaults to None. + page : str + searches for entries with matching page numbers. Defaults to None. + issue : str + searches for journal entries with matching issue numbers. Defaults to None. + language : str + searches for entries by language Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + references : str + searches for entries with citations that contain matching strings. Defaults to None. + + Returns + ------- + query : str + a query formatted for input into the Scopus API. + """ + query = '' if (all_fields is not None) and (type(all_fields) == str): # type: ignore @@ -175,7 +240,6 @@ def query_builder(default_operator = 'AND', return query - def search(tile_abs_key_auth = None, all_fields = None, title = None, @@ -201,14 +265,91 @@ def search(tile_abs_key_auth = None, link = None, references = None, default_operator = 'AND', - refresh=True, - view=None, - verbose=False, - download=True, - integrity_fields=None, - integrity_action='raise', - subscriber=False): - + refresh=True, + view=None, + verbose=False, + download=True, + integrity_fields=None, + integrity_action='raise', + subscriber=False): + + """ + Searches Scopus API and returns the results as a Pandas DataFrame. + + Parameters + ---------- + tile_abs_key_auth : str + a combined search. Searches for titles, abstracts, keywords, and author names. Defaults to None. + all_fields : str + searches all fields. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + year : str + searches for matching publication years. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_identifier : str + searches for Scopus author IDs. Defaults to None. + affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. + publisher : str + searches for publisher names containing string. Defaults to None. + funder : str + searches for funder names containing string. Defaults to None. + abstract : str + searches for abstracts containing string. Defaults to None. + keywords : str + searches for matching keywords. Defaults to None. 
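To make the Scopus query construction concrete, a sketch of how query_builder might be called with a few of the fields documented above, assuming the keyword arguments match the documented field names; the values are placeholders and the resulting string is simply printed:

```python
# Hypothetical usage sketch for the Scopus query builder documented above.
from art.importers.scopus import query_builder

# Combine a title search, a publication year, and an author name with AND logic.
query = query_builder(default_operator='AND',
                      title='systematic review',
                      year='2022',
                      author='Smith')

print(query)  # a string formatted for the Scopus Search API
```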
+ doctype : str + searches for types of entries containing string. Defaults to None. + doi : str + searches for matching DOIs. Defaults to None. + issn : str + searches for matching ISSNs. Defaults to None. + isbn : str + searches for matching ISBNs. Defaults to None. + pubmed_id : str + searches for matching PubMed IDs (PMIDs). Defaults to None. + source_title : str + searches for source titles (e.g. journals, books) containing string. Defaults to None. + volume : str + searches for journal entries with matching volume numbers. Defaults to None. + page : str + searches for entries with matching page numbers. Defaults to None. + issue : str + searches for journal entries with matching issue numbers. Defaults to None. + language : str + searches for entries by language Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + references : str + searches for entries with citations that contain matching strings. Defaults to None. + default_operator : str + the default Boolean operator to build the search. Defaults to 'AND'. + refresh : bool + view : bool + verbose : bool + download : bool + integrity_fields : None + integrity_action : str + subscriber : bool + + Returns + ------- + df : pandas.DataFrame + results from Scopus API search. + + Options + ------- + Options for default_operator: + * 'AND' + * 'AND NOT' + * 'NOT' + * 'OR' + """ + query = query_builder(default_operator = default_operator, all_fields = all_fields, title = title, @@ -286,6 +427,31 @@ def lookup(uid: str = 'request_input', view = 'META', id_type = None): + """ + Looks up publication using the Scopus API. + + Parameters + ---------- + uid : str + Scopus ID, DOI, ISBN, ISSN, or Pubmed ID (PMID) to look up. Defaults to requesting from user input. + refresh : bool + whether to refresh the Scopus session. + view : str + sets the amount of detail returned. Defaults to 'META'. + add_to_results : bool + whether to add results to Review. + drop_duplicates : bool + whether to remove duplicated rows when adding to results. + drop_empty_rows : bool + whether to remove rows which do not contain any data when adding to results. + id_type : None + + Returns + ------- + df : pandas.DataFrame + results from publication lookup on Scopus API. + """ + if uid == 'request_input': uid = input('ID: ') diff --git a/art/importers/search.py b/art/importers/search.py index c42eb59..40978f1 100644 --- a/art/importers/search.py +++ b/art/importers/search.py @@ -41,6 +41,88 @@ def search(default_query = None, orcid = False ): + """ + Searches multiple APIs and returns the results as a Pandas DataFrame. API options: + * CrossRef + * Scopus + * Web of Science (WoS) + * ORCID + + Parameters + ---------- + default_query : str + a combined search. Searches for titles, abstracts, authors, publishers, dates etc. Defaults to None. + all_fields : str + Scopus only: searches all fields. Defaults to None. + title : str + searches for titles containing string. Defaults to None. + year : str + searches for matching publication years. Defaults to None. + author : str + searches for authors containing string. Defaults to None. + author_identifier : str + searches for API-specific author IDs (e.g. CrossRef, Scopus, WoS, Orcid). Defaults to None. + entry_type : str + searches for types of entries containing string. Defaults to None. + affiliation : str + searches for author affiliations containing string. Defaults to None. + editor : str + searches for editor names containing string. Defaults to None. 
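A sketch of the Scopus search and lookup calls documented above. The underlying pybliometrics client needs a configured Scopus API key, so this is illustrative only; the DOI is a placeholder:

```python
# Hypothetical usage sketch for the Scopus search and lookup functions documented above.
# Requires a pybliometrics configuration with a valid Scopus API key.
from art.importers.scopus import search, lookup

# Field-based search, combined with the default 'AND' operator.
results_df = search(title='bibliometrics', year='2023', default_operator='AND')

# Look up a single record by identifier (placeholder DOI).
record_df = lookup(uid='10.1000/xyz123')
```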
+ publisher : str + searches for publisher names containing string. Defaults to None. + funder : str + searches for funder names containing string. Defaults to None. + abstract : str + searches for abstracts containing string. Defaults to None. + keywords : str + searches for matching keywords. Defaults to None. + doi : str + searches for matching DOIs. + issn : str + searches for matching ISSNs. + isbn : str + searches for matching ISBNs. Defaults to None. + pubmed_id : str + searches for matching PubMed IDs (PMIDs). Defaults to None. + source_title : str + searches for sources with titles (e.g. journals, books) containing string. Defaults to None. + volume : str + searches for journal entries with matching volume numbers. Defaults to None. + page : str + searches for entries with matching page numbers. Defaults to None. + issue : str + searches for journal entries with matching issue numbers. Defaults to None. + language : str + searches for entries by language Defaults to None. + link : str + searches for entry links containing string. Defaults to None. + references : str + searches for entries with citations that contain matching strings. Defaults to None. + topics : str + searches for entries tagged with matching topic names and keywords. Defaults to None. + default_operator : str + the default Boolean operator to build searches. Defaults to 'AND'. + limit_per_api : int + sets limits for the number of results to return per API. Used to limit impact on API servers. Defaults to 20. + rate_limit : float + CrossRef only: time delay in seconds per result. Used to limit impact on API servers. Defaults to 0.05 seconds. + timeout : int + CrossRef only: maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds. + crossref : bool + whether to search using the CrossRef API. Defaults to True. + scopus : bool + whether to search using the Scopus API. Defaults to True. + wos : bool + whether to search using the Web of Science (WoS) API. Defaults to False. + orcid : bool + whether to search using the ORCID API. Defaults to False. + + Returns + ------- + df : pandas.DataFrame + combined results from API searches. + """ + df = pd.DataFrame(dtype=object) if crossref == True: diff --git a/art/importers/wos.py b/art/importers/wos.py index 7e580b5..f1ebbe5 100644 --- a/art/importers/wos.py +++ b/art/importers/wos.py @@ -33,6 +33,10 @@ def extract_source(source_dict): + """ + Extracts publication source from Web of Science API result. + """ + source = None if (type(source_dict) == list) and (len(source_dict)>0): @@ -47,6 +51,10 @@ def extract_source(source_dict): def extract_cite_counts(citations_dict): + """ + Extracts citation counts from Web of Science API result. + """ + count = None if (type(citations_dict) == list) and (len(citations_dict)>0): @@ -61,6 +69,10 @@ def extract_cite_counts(citations_dict): def extract_isbn(identifiers): + """ + Extracts publication ISBN from Web of Science API result. + """ + isbn = None if (type(identifiers) == list) and (len(identifiers)>0): @@ -75,6 +87,10 @@ def extract_isbn(identifiers): def extract_issn(identifiers): + """ + Extracts publication ISSN from Web of Science API result. + """ + issn = None if (type(identifiers) == list) and (len(identifiers)>0): @@ -89,6 +105,10 @@ def extract_issn(identifiers): def extract_doi(identifiers): + """ + Extracts publication DOI from Web of Science API result. 
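For the combined search function documented above, a sketch showing how the per-API switches and limits might be used together; the query string is a placeholder:

```python
# Hypothetical usage sketch for the combined multi-API search documented above.
from art.importers.search import search

# Search CrossRef and Scopus (but not WoS or ORCID) for a topic,
# capping each API at 20 results to limit load on the servers.
df = search(default_query='citation network analysis',
            crossref=True,
            scopus=True,
            wos=False,
            orcid=False,
            limit_per_api=20,
            rate_limit=0.05,
            timeout=60)

print(df.head())
```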
+ """ + doi = None if (type(identifiers) == list) and (len(identifiers)>0): @@ -103,6 +123,10 @@ def extract_doi(identifiers): def extract_keywords(keywords_dict): + """ + Extracts publication keywords from Web of Science API result. + """ + kws = None if (type(keywords_dict) == list) and (len(keywords_dict)>0): @@ -117,6 +141,10 @@ def extract_keywords(keywords_dict): def extract_related(links_dict): + """ + Extracts related publications from Web of Science API result. + """ + link = None if (type(links_dict) == list) and (len(links_dict)>0): @@ -131,6 +159,10 @@ def extract_related(links_dict): def extract_links(links_dict): + """ + Extracts links from Web of Science API result. + """ + link = None if (type(links_dict) == list) and (len(links_dict)>0): @@ -150,10 +182,12 @@ def extract_links(links_dict): return link - - def operator_logic(default_operator: str, string: str): + """ + Takes Web of Science API search string, detects the logical operator used, and separates the operator and string. Returns a tuple. + """ + operator = default_operator if string.startswith('AND '): @@ -202,6 +236,36 @@ def query_builder(default_operator = 'AND', topics = None ): + """ + Takes queries for specific search fields and returns a string which is formatted for input into the Web of Science API. + + Parameters + ---------- + default_operator : str + default logical operator to build search. Defaults to 'AND'. + all_fields : str + title : str + year : str + author : str + author_identifier : str + affiliation : str + doctype : str + doi : str + issn : str + isbn : str + pubmed_id : str + source_title : str + volume : str + page : str + issue : str + topics : str + + Returns + ------- + query : str + a query formatted for input into the Web of Science API. + """ + query = '' if (all_fields is not None) and (type(all_fields) == str): # type: ignore @@ -285,6 +349,27 @@ def search_engine(query: str = 'request_input', detail = None ): + """ + Core functionality for making Web of Science publication search API calls. + + Parameters + ---------- + query: str + a query formatted for input into the Web of Science API. + database : str + limit : int + page : int + sort_field : str + modified_time_span + tc_modified_time_span + detail + + Returns + ------- + api_response : DocumentsList + Web of Science API response. + """ + if query == 'request_input': query = input('Search query: ') @@ -329,6 +414,43 @@ def search( detail = None ): + """ + Searches Web of Science API for published works. Returns the results as a Pandas DataFrame. + + Parameters + ---------- + all_fields : str + title : str + year : str + author : str + author_identifier : str + affiliation : str + doctype : str + doi : str + issn : str + isbn : str + pubmed_id : str + source_title : str + volume : str + page : str + issue : str + topics : str + default_operator : str + default logical operator to build search. Defaults to 'AND'. + database : str + limit : int + page : int + sort_field : str + modified_time_span + tc_modified_time_span + detail + + Returns + ------- + df : pandas.DataFrame + results from Web of Science API search. + """ + query = query_builder(default_operator = default_operator, all_fields = all_fields, title = title, @@ -403,6 +525,20 @@ def search( def journals_search_engine(issn: str = 'request_input'): + """ + Core functionality for making Web of Science journal search API calls. + + Parameters + ---------- + issn: str + an ISSN to search. + + Returns + ------- + api_response : JournalsList + Web of Science API response. 
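A sketch of the Web of Science search documented above, assuming the WoS API client used by the package has been configured with a valid key and that the keyword arguments match the documented field names; the values are placeholders:

```python
# Hypothetical usage sketch for the Web of Science search functions documented above.
from art.importers.wos import query_builder, search

# Build a query string from individual fields...
query = query_builder(default_operator='AND', title='machine learning', year='2021')

# ...or let search() build and run it in one step, returning a DataFrame.
df = search(title='machine learning', year='2021', default_operator='AND', limit=10)
```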
+ """ + if issn == 'request_input': issn = input('ISSN to search: ') @@ -424,6 +560,19 @@ def search_journals( query = 'request_input' ): + """ + Searches Web of Science API for journals. Returns the results as a Pandas DataFrame. + + Parameters + ---------- + query : str + search query. + + Returns + ------- + df : pandas.DataFrame + results from Web of Science API search. + """ api_response = journals_search_engine(issn=query) diff --git a/art/internet/__pycache__/__init__.cpython-39.pyc b/art/internet/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..366584c Binary files /dev/null and b/art/internet/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/internet/__pycache__/crawlers.cpython-39.pyc b/art/internet/__pycache__/crawlers.cpython-39.pyc new file mode 100644 index 0000000..1bb12f2 Binary files /dev/null and b/art/internet/__pycache__/crawlers.cpython-39.pyc differ diff --git a/art/internet/__pycache__/scrapers.cpython-39.pyc b/art/internet/__pycache__/scrapers.cpython-39.pyc new file mode 100644 index 0000000..94e8457 Binary files /dev/null and b/art/internet/__pycache__/scrapers.cpython-39.pyc differ diff --git a/art/internet/__pycache__/search.cpython-39.pyc b/art/internet/__pycache__/search.cpython-39.pyc new file mode 100644 index 0000000..3af3fe2 Binary files /dev/null and b/art/internet/__pycache__/search.cpython-39.pyc differ diff --git a/art/internet/__pycache__/webanalysis.cpython-39.pyc b/art/internet/__pycache__/webanalysis.cpython-39.pyc new file mode 100644 index 0000000..2edc80e Binary files /dev/null and b/art/internet/__pycache__/webanalysis.cpython-39.pyc differ diff --git a/art/internet/crawlers.py b/art/internet/crawlers.py index f1d4e68..b3365ba 100644 --- a/art/internet/crawlers.py +++ b/art/internet/crawlers.py @@ -63,7 +63,6 @@ def is_external_link(source_url: str = 'request_input', linked_url: str = 'reque # Checking if link is external return is_external(source_url, linked_url, ignore_suffix=ignore_suffix) - def fetch_feed_urls(url: str = 'request_input') -> list: """ @@ -132,7 +131,7 @@ def fetch_url_rules(url = 'request_input'): Returns ------- - result : object + result : RobotFileParser Robots.txt ruleset. """ @@ -290,7 +289,8 @@ def crawl_site(url: str = 'request_input', max_seen_urls: int = 10, max_known_ur def correct_link_errors(url: str, source_domain = None) -> str: - """Checks for errors in a link and corrects them. Returns a corrected link as a string. + """ + Checks for errors in a link and corrects them. Returns a corrected link as a string. Parameters ---------- @@ -302,7 +302,8 @@ def correct_link_errors(url: str, source_domain = None) -> str: Returns ------- url : str - a corrected URL.""" + a corrected URL. + """ # Source domain defaults to None. Changing this to an empty string for type handling. if source_domain == None: @@ -522,7 +523,6 @@ def excluded_term_test(current_url: str, excluded_url_terms: list, case_sensitiv if (term in url_check) == True: return True - def required_keywords_test(text, required_keywords, case_sensitive): """ @@ -843,7 +843,6 @@ def clean_seed_urls(seed_urls: list) -> list: return cleaned_seeds - def crawler( seed_urls: str = 'request_input', visit_limit: int = 5, @@ -1161,6 +1160,38 @@ def crawl_google_scholar( case_sensitive = False, ): + """ + Crawls from a Google Scholar search. Returns details like links found, HTML scraped, and site metadata in a Pandas DataFrame. + + Parameters + ---------- + query : str + query to search Google Scholar. 
Defaults to requesting from user input. + page_limit : int + maximum number of Google Scholar results pages to scrape. Defaults to 20. + by_citations : bool + whether to crawl citations. Defaults to True. + by_recommended : bool + whether to crawl Google Scholar recommendation links. Defaults to True. + crawl_depth : int + the maximum crawl depth the crawler will reach before stopping. Defaults to 3. + crawl_limit : int + the maximum number of websites the crawler will visit before stopping. Defaults to 100. + discovery_limit : int + the maximum number of results the crawler will discover before stopping. Defaults to 1000. + select_keywords : list + list of keywords which sites must contain to be crawled. + exclude_keywords : list + list of keywords which sites must *not* contain to be crawled. + case_sensitive : bool + whether or not to ignore string characters' case. + + Returns + ------- + df : pandas.DataFrame + a Pandas DataFrame containing the results of a crawl. + """ + if query == 'request_input': query = input('Search query: ') @@ -1280,7 +1311,6 @@ def crawl_google_scholar( return df - def network_from_crawl_result(crawl_df: pd.DataFrame) -> Graph: """ @@ -1806,7 +1836,6 @@ def site_similarity_network(comparisons_df: pd.DataFrame, measure: str = 'html_d return network - def similarity_network_from_crawl_result(crawl_df: pd.DataFrame, measure: str = 'html_distance') -> Graph: """ @@ -1973,7 +2002,6 @@ def similarity_network_from_crawl( return network - def network_from_search_crawl( query: str = 'request_input', visit_limit: int = 5, @@ -2040,8 +2068,6 @@ def network_from_search_crawl( return network - - def similarity_network_from_search_crawl( query: str = 'request_input', measure: str = 'html_distance', diff --git a/art/internet/scrapers.py b/art/internet/scrapers.py index 33c31ad..1023e72 100644 --- a/art/internet/scrapers.py +++ b/art/internet/scrapers.py @@ -50,6 +50,10 @@ def get_final_url(url): + """ + Follows URL redirects and returns the final URL destination. + """ + global headers req = Request(url=url, headers=headers) @@ -57,12 +61,27 @@ def get_final_url(url): return resp.geturl() def bs_find(tag, content, soup): + + """ + Easy-to-use function for using BeautifulSoup soup.find method. Returns a tuple containing the code and result. + """ + return ('soup.find(attrs={"'+tag+'":"'+content + '"})', soup.find(attrs={tag:content})) def bs_find_all(tag, content, soup): + + """ + Easy-to-use function for using BeautifulSoup soup.find_all method. Returns a tuple containing the code and result. + """ + return ('soup.find_all(attrs={"'+tag+'":"'+content + '"})', soup.find_all(attrs={tag:content})) def bs_name_content(content_tag, soup): + + """ + Easyto-use function for using BeautifulSoup soup.find method to identify name content. Returns a tuple containing the code and result. + """ + return ('soup.find(attrs={"name":"'+content_tag + '"}).attrs["content"]', soup.find(attrs={'name':content_tag}).attrs['content']) def get_url_source(url = 'request_input'): @@ -99,6 +118,10 @@ def get_url_source(url = 'request_input'): def url_to_soup(url = 'request_input'): + """ + Takes URL, scapes the site, and returns as a BeautifulSoup object. + """ + if url == 'request_input': url = input('URL: ') @@ -767,6 +790,10 @@ def crawler_scraper(current_url: str, full: bool) -> tuple: def scrape_frontiers(url): + """ + Bespoke web scraper to scrape and parse Frontiers article webpages. Takes a Frontiers URL and returns a Pandas DataFrame. 
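A sketch of how the Google Scholar crawler documented above might be invoked with conservative limits; the query and limits are placeholders chosen to keep the crawl small:

```python
# Hypothetical usage sketch for the Google Scholar crawler documented above.
from art.internet.crawlers import crawl_google_scholar

df = crawl_google_scholar(query='academic review tool',
                          page_limit=2,          # scrape at most two results pages
                          by_citations=True,     # follow citation links
                          by_recommended=False,  # skip recommendation links
                          crawl_depth=1,
                          crawl_limit=25,
                          discovery_limit=200)

print(df.head())
```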
+ """ + if ('frontiersin.org' not in url) and ('doi.org' not in url): raise ValueError('URL must be for a Frontiers webpage') @@ -881,6 +908,10 @@ def scrape_frontiers(url): def scrape_arxiv(url): + """ + Bespoke web scraper to scrape and parse ArXiv article webpages. Takes an ArXiv URL and returns a Pandas DataFrame. + """ + if ('arxiv.org' not in url) and ('doi.org' not in url): raise ValueError('URL must be for a Arxiv webpage') @@ -984,6 +1015,10 @@ def scrape_arxiv(url): def scrape_springer(url): + """ + Bespoke web scraper to scrape and parse Springer article webpages. Takes a Springer URL and returns a Pandas DataFrame. + """ + if ('springer' not in url) and ('doi.org' not in url): raise ValueError('URL must be for a Springer webpage') @@ -1145,6 +1180,10 @@ def scrape_springer(url): def scrape_nature(url = 'request_input'): + """ + Bespoke web scraper to scrape and parse Nature article webpages. Takes a Nature URL and returns a Pandas DataFrame. + """ + if url == 'request_input': url = input('URL: ') @@ -1245,6 +1284,10 @@ def scrape_nature(url = 'request_input'): def scrape_ieee(url): + """ + Bespoke web scraper to scrape and parse IEEE article webpages. Takes an IEEE URL and returns a Pandas DataFrame. + """ + if type(url) !=str: raise TypeError('URL must be a string') @@ -1357,6 +1400,10 @@ def scrape_ieee(url): def scrape_pubmed(url): + """ + Bespoke web scraper to scrape and parse PubMed article webpages. Takes a PubMed URL and returns a Pandas DataFrame. + """ + if type(url) !=str: raise TypeError('URL must be a string') @@ -1457,6 +1504,10 @@ def scrape_pubmed(url): def scrape_pmc(url): + """ + Bespoke web scraper to scrape and parse PMC article webpages. Takes a PMC URL and returns a Pandas DataFrame. + """ + if type(url) !=str: raise TypeError('URL must be a string') @@ -1570,6 +1621,10 @@ def scrape_pmc(url): def scrape_ssrn(url = 'request_input'): + """ + Bespoke web scraper to scrape and parse SSRN article webpages. Takes an SSRN URL and returns a Pandas DataFrame. + """ + if url == 'request_input': url = input('URL: ') @@ -1652,6 +1707,10 @@ def scrape_ssrn(url = 'request_input'): def scrape_heinonline(url): + """ + Bespoke web scraper to scrape and parse HeinOnline article webpages. Takes a HeinOnline URL and returns a Pandas DataFrame. + """ + if type(url) !=str: raise TypeError('URL must be a string') @@ -1721,6 +1780,10 @@ def scrape_heinonline(url): def scrape_mdpi(url): + """ + Bespoke web scraper to scrape and parse MDPI article webpages. Takes an MDPI URL and returns a Pandas DataFrame. + """ + if type(url) !=str: raise TypeError('URL must be a string') @@ -1804,6 +1867,10 @@ def scrape_mdpi(url): def scrape_acm(url = 'request_input'): + """ + Bespoke web scraper to scrape and parse ACM article webpages. Takes an ACM URL and returns a Pandas DataFrame. + """ + if url == 'request_input': url = input('URL: ') @@ -1897,6 +1964,22 @@ def scrape_acm(url = 'request_input'): def parse_muse_from_source(source = 'request_input', link = None): + """ + Parses source HTML from a Project MUSE article webpage and returns a Pandas DataFrame. + + Parameters + ---------- + source : str + source HTML. + link : str + link to webpage. + + Returns + ------- + result : pandas.DataFrame + a Pandas DataFrame containing parsed Project MUSE data. 
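The bespoke scrapers documented above share a common pattern: each takes an article URL from the relevant platform and returns the parsed record as a Pandas DataFrame. A sketch with placeholder URLs:

```python
# Hypothetical usage sketch for the site-specific scrapers documented above.
from art.internet.scrapers import scrape_arxiv, scrape_nature

# Each scraper checks that the URL belongs to its platform and raises an error otherwise.
arxiv_df = scrape_arxiv('https://arxiv.org/abs/2101.00001')       # placeholder URL
nature_df = scrape_nature('https://www.nature.com/articles/xyz')  # placeholder URL
```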
+ """ + if source == 'request_input': source = input('HTML: ') @@ -2032,6 +2115,10 @@ def parse_muse_from_source(source = 'request_input', link = None): def scrape_muse(url = 'request_input'): + """ + Bespoke web scraper to scrape and parse Project MUSE article webpages. Takes a Project MUSE URL and returns a Pandas DataFrame. + """ + if url == 'request_input': url = input('URL: ') @@ -2052,6 +2139,22 @@ def scrape_muse(url = 'request_input'): def parse_proquest_from_source(source, link = None): + """ + Parses source HTML from a ProQuest article webpage and returns a Pandas DataFrame. + + Parameters + ---------- + source : str + source HTML. + link : str + link to webpage. + + Returns + ------- + result : pandas.DataFrame + a Pandas DataFrame containing parsed ProQuest data. + """ + if type(source) !=str: raise TypeError('Source must be a string') @@ -2192,6 +2295,10 @@ def parse_proquest_from_source(source, link = None): def scrape_proquest(url = 'request_input'): + """ + Bespoke web scraper to scrape and parse ProQuest article webpages. Takes a ProQuest URL and returns a Pandas DataFrame. + """ + if url == 'request_input': url = input('URL: ') @@ -2212,6 +2319,22 @@ def scrape_proquest(url = 'request_input'): def parse_jstor_from_source(source = 'request_input', link = None): + """ + Parses source HTML from a JSTOR article webpage and returns a Pandas DataFrame. + + Parameters + ---------- + source : str + source HTML. + link : str + link to webpage. + + Returns + ------- + result : pandas.DataFrame + a Pandas DataFrame containing parsed JSTOR data. + """ + if source == 'request_input': source = input('Source code: ') @@ -2334,6 +2457,10 @@ def parse_jstor_from_source(source = 'request_input', link = None): def scrape_jstor(url = 'request_input'): + """ + Bespoke web scraper to scrape and parse JSTOR article webpages. Takes a JSTOR URL and returns a Pandas DataFrame. + """ + if url == 'request_input': url = input('URL: ') @@ -2354,6 +2481,22 @@ def scrape_jstor(url = 'request_input'): def parse_google_scholar_source(source = 'request_input'): + """ + Parses source HTML from a Google Scholar webpage and returns a Pandas DataFrame. + + Parameters + ---------- + source : str + source HTML. + link : str + link to webpage. + + Returns + ------- + result : pandas.DataFrame + a Pandas DataFrame containing parsed JSTOR data. + """ + if source == 'request_input': source = '"""' + input('Source code: ') + '"""' @@ -2450,6 +2593,19 @@ def parse_google_scholar_source(source = 'request_input'): def search_google_scholar(query = 'request_input', pages = 1, open_source = False): + """ + Runs a Google Scholar search in the default web browser. + + Parameters + ---------- + query : str + query to search Google Scholar. Defaults to requesting from user input. + pages : int + maximum number of Google Scholar pages to return + open_source : bool + whether to open the webpage source code. Defaults to False. + """ + if query == None: query = input('Query: ') query = urllib.parse.quote_plus(query) # type: ignore @@ -2464,6 +2620,10 @@ def search_google_scholar(query = 'request_input', pages = 1, open_source = Fals def open_google_scholar_links(source = None): + """ + Takes Google Scholar source HTML and opens each link to a result in the default browser. + """ + if source == None: source = '"""' + input('Source code: ') + '"""' @@ -2474,6 +2634,10 @@ def open_google_scholar_links(source = None): def scrape_google_scholar(url): + """ + Bespoke web scraper to scrape and parse Google Scholar record webpages. 
Takes a Google Scholar URL and returns a Pandas DataFrame. + """ + if type(url) != str: raise TypeError('Query must be a string') @@ -2492,6 +2656,8 @@ def scrape_google_scholar(url): def scrape_google_scholar_search(query): + """Scrapes Google Scholar from a search query. Returns a Pandas DataFrame.""" + if type(query) != str: raise TypeError('Query must be a string') @@ -2504,6 +2670,22 @@ def scrape_google_scholar_search(query): def iterate_scholar_pages(scholar_page, page_limit = 20): + """ + Iteratively scrapes Google Scholar search pages. + + Parameters + ---------- + scholar_page : str + URL for Google Scholar search. + page_limit : int + maximum number of pages to scrape. + + Returns + ------- + df : pandas.DataFrame + a Pandas DataFrame containing scraped Google Scholar results. + """ + global results_cols df = pd.DataFrame(columns = results_cols, dtype=object) @@ -2534,6 +2716,10 @@ def iterate_scholar_pages(scholar_page, page_limit = 20): def scrape_doi(doi): + """ + Bespoke web scraper to scrape and parse webpages from a DOI or doi.org URL. Takes a DOI or doi.org URL and returns a Pandas DataFrame. + """ + if doi.startswith('www.doi.org/') == True: doi = doi.replace('www.doi.org/', 'https://doi.org/') @@ -2550,6 +2736,34 @@ def scrape_doi(doi): def scrape_article(url = 'request_input') -> pd.DataFrame: + """ + Scrapes article data from a given URL and adds to Results. + + Parameters + ---------- + url : str + url of article to scrape. Defaults to requesting from user input. + + Notes + ----- + This function is capable of scraping: + * Frontiers + * ArXiv + * Springer + * Nature + * IEEE + * PubMed + * PMC + * SSRN + * HeinOnline + * MDPI + * ACM + * Project Muse + * Proquest + * JSTOR + * Google Scholar + """ + if url == 'request_input': url = input('URL: ') @@ -2612,5 +2826,4 @@ def scrape_article(url = 'request_input') -> pd.DataFrame: -# Automated scraping not functional for CUP, OUP, SAGE, T&F, Science, Wiley, SciDirect, or ResearchGate sites due to bot blockers. -## See notebook for scripts for parsing CUP, OUP, SAGE, T&F, Science, Wiley, SciDirect, and ResearchGate \ No newline at end of file +# Automated scraping not functional for CUP, OUP, SAGE, T&F, Science, Wiley, SciDirect, or ResearchGate sites due to bot blockers. 
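Rather than picking a site-specific scraper by hand, the dispatcher functions documented above can route a URL or DOI to the appropriate parser. A sketch with placeholder identifiers:

```python
# Hypothetical usage sketch for the generic scraping entry points documented above.
from art.internet.scrapers import scrape_article, scrape_doi

# scrape_article inspects the URL and hands off to the matching bespoke scraper.
article_df = scrape_article(url='https://www.frontiersin.org/articles/example')  # placeholder

# scrape_doi resolves a DOI (or doi.org URL) and then scrapes the landing page.
doi_df = scrape_doi('10.1000/xyz123')  # placeholder DOI
```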
\ No newline at end of file diff --git a/art/internet/search.py b/art/internet/search.py index 07f85ff..7ff3bc7 100644 --- a/art/internet/search.py +++ b/art/internet/search.py @@ -55,7 +55,6 @@ def search_web( else: open_url(url = url) - def multi_search_web(iteration_terms: list = 'request_input', query: str = 'request_input', search_engine: str = 'Google', @@ -137,7 +136,6 @@ def search_website(query: str = 'request_input', url = 'request_input', view_sou # Opening web search search_web(query, search_engine = 'Google', view_source = view_source) - def search_social_media(query: str = 'request_input', platform: str = 'Twitter', view_source = False): """ @@ -201,7 +199,6 @@ def search_social_media(query: str = 'request_input', platform: str = 'Twitter', # Running search search_web(query, search_engine = 'Google', view_source = view_source) - def search_twitter(query: str = 'request_input'): """ @@ -210,7 +207,6 @@ def search_twitter(query: str = 'request_input'): search_social_media(query = query, platform = 'Twitter') - def search_images(query: str = 'request_input', search_engine: str = 'Google Images' ): @@ -251,8 +247,6 @@ def search_images(query: str = 'request_input', # Opening search return webbrowser.open(url) - - def reverse_image_search(url = 'request_input', search_engine: str = 'Google Images' ): diff --git a/art/internet/webanalysis.py b/art/internet/webanalysis.py index 25c9e51..9c9a317 100644 --- a/art/internet/webanalysis.py +++ b/art/internet/webanalysis.py @@ -4,15 +4,16 @@ import copy import re import ipaddress -import socket import webbrowser -import numpy as np -import pandas as pd from courlan import check_url from urllib.parse import quote def is_url(url: str) -> bool: + + """ + Checks if a string is a correctly formatted URL. Returns True if yes; False if no. + """ url_regex = re.compile(r'https?://(?:www\.)?[a-zA-Z0-9./]+') return bool(url_regex.match(url)) @@ -46,7 +47,6 @@ def domain_splitter(web_address: str) -> str: return domain - def is_domain(domain_name: str = 'request_input') -> bool: """ @@ -92,7 +92,6 @@ def is_domain(domain_name: str = 'request_input') -> bool: except: return False - def is_ip_address(string: str = 'request_input') -> bool: """ @@ -167,790 +166,7 @@ def get_domain(url: str) -> str: domain = domain_splitter(url) return domain - -def get_my_ip() -> str: - - """ - Returns user's IP address. - """ - - # Trying to retrieve IP address using socket's gethostbyname() method - try: - hostname=socket.gethostname() - IPAddr=socket.gethostbyname(hostname) - - - except: - - # If socket's gethostbyname() method raises an error, trying to use socket's getfqdn() method - try: - IPAddr = socket.gethostbyname(socket.getfqdn()) - - except: - - # If socket's getfqdn() method raises an error, trying to use socket's connect() and getsockname() methods - - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) - IPAddr = s.getsockname()[0] - s.close() - - return IPAddr - -def get_my_ip_geocode(): - - """ - Returns the geocode associated with user's IP address. - """ - - return geocoder.ip('me') - -def get_ip_geocode(ip_address: str = 'request_input'): - - """ - Returns the geocode associated with an IP address. - """ - - # Requesting IP address from user input if none given - if ip_address == 'request_input': - ip_address = input('IP address: ') - - return geocoder.ip(ip_address) - -def get_ip_coordinates(ip_address: str = 'request_input') -> str: - - """ - Returns the coordinates associated with an IP address. 
- """ - - # Requesting IP address from user input if none given - if ip_address == 'request_input': - ip_address = input('IP address: ') - - # Retrieving geocode - g = geocoder.ip(ip_address) - - # Retrieving coordinates - coords = str(g.latlng) - - return coords - -def get_my_ip_coordinates() -> str: - - """ - Returns the coordinates associated with user's IP address. - """ - - return get_ip_coordinates(ip_address = 'me') - -def get_ip_physical_location(ip_address: str = 'request_input') -> str: - - """ - Returns the address associated with an IP address. - """ - - # Requesting IP address from user input if none given - if ip_address == 'request_input': - ip_address = input('IP address: ') - - # Retrieving address associated with IP's coordinates using Geopy - address = str(geocoder.ip(ip_address).address) - - return address - -def get_my_ip_physical_location() -> str: - - """ - Returns the address associated with user's IP address. - """ - - # Retrieving geocode associated with user's IP - location = get_my_ip_geocode() - - # Retrieving address - address = str(location.address) - - return address - -def lookup_ip_coordinates(ip_address: str = 'request_input', site: str = 'Google Maps'): - - """ - Searches for coordinates associated with an IP address on a chosen mapping platform. - - Parameters - ---------- - ip_address : str - IP address to look up. - site : str - name of mapping platform to use. Defaults to 'Google Maps'. - """ - - # Requesting IP address from user input if none given - if ip_address == 'request_input': - ip_address = input('IP address: ') - - # Retrieving coordinates - coordinates = get_ip_coordinates(ip_address = ip_address) - latitude = coordinates[0] - longitude = coordinates[1] - - # Searching for coordinates on mapping platform - try: - return lookup_coordinates(latitude = latitude, longitude = longitude, site = site) - except: - raise ValueError('Lookup failed. Please check the IP address provided.') - - -def domain_from_ip(ip_address: str) -> pd.Series: - - """ - Returns domain address associated with IP address. - """ - - # Cleaning IP address - ip_address = ip_address.strip().strip('/').strip() - - # Checking if string is a valid IP address - if is_ip_address(ip_address) == True: - - # Trying to retrieve domain - try: - result = socket.gethostbyaddr(ip_address) - except socket.herror: - return "No domain details found" - - # Creating Pandas series for result - output_series = pd.Series(result, index = ['domain_name', 'aliases', 'ip_address']) - - return output_series - - else: - raise ValueError('Address given is not a valid IP address') - -def ip_from_domain(domain: str) -> str: - - """ - Returns IP address associated with domain address. - """ - - # Checking if string is a valid domain address - if is_domain(domain) == False: - try: - domain = domain_splitter(domain) - except: - pass - - if is_domain(domain) == False: - domain = None - - # Trying to retrieve IP address associated with domain - try: - result = str(socket.gethostbyname(domain)) - - except socket.herror: - return "No domain details found" - - return result - - - - - -class WhoisResult: - - """ - This is a class to store WhoIs result data. - - Parameters - ---------- - domain : str - domain to run WhoIs search on. - ip_address : str - IP address to run WhoIs search on. - """ - - def __init__(self, domain: str = None, ip_address: str = None): - - """ - Initialises WhoisResult object. - - Parameters - ---------- - domain : str - domain to run WhoIs search on. 
- ip_address : str - IP address to run WhoIs search on. - """ - - # Creating results dataframe and assigning as attribute - self.all_results = pd.DataFrame(columns = ['Metadata'], dtype = object) - self.all_results.index.name = 'Category' - - # Cleaning domain if given - if domain != None: - if is_domain(domain) == False: - try: - domain = domain_splitter(domain) - except: - pass - - if is_domain(domain) == False: - domain = None - - # Cleaning IP address if given - if ip_address != None: - ip_address = ip_address.strip().strip('/').strip() - if is_ip_address(ip_address) == False: - ip_address = None - - # If a domain is given but no IP address, tries to retrieve associated IP address - if (domain != None) and (ip_address == None): - try: - ip_address = ip_from_domain(domain) - except: - ip_address = None - - # If an IP address is given but no domain, tries to retrieve associated domain - if (domain == None) and (ip_address != None): - try: - domain = domain_from_ip(ip_address)['domain_name'] - except: - domain = None - - # Assigning domain and IP address as attribute - self.domain = domain - self.ip_address = ip_address - - # Running domain WhoIs lookup - try: - self.domain_whois(domain = domain, ip_address = ip_address) - except: - pass - - # Running IP WhoIs lookup - try: - self.ip_whois(domain = domain, ip_address = ip_address) - except: - pass - - # If IP WhoIs search has been successful, assigns RDAP result as an attribute - try: - self.RDAP_obj = self.IPWhois_obj.lookup_rdap(depth=1) - except: - self.RDAP_obj = None - - # If IP WhoIs search has been successful, assigns RDAP result as an attribute - try: - self.results_dict = self.RDAP_obj - except: - self.results_dict = None - - # If WhoIs search has been successful, assigns rdap_res result to results dataframe - try: - rdap_res = pd.Series(self.RDAP_obj) - df = pd.DataFrame(rdap_res, columns = ['Metadata']) - df.index.name = 'Category' - self.all_results = pd.concat([df, self.all_results]) - except: - pass - - # If WhoIs search has been successful, assigns nir result as an attribute - try: - self.nir = self.all_results.loc['nir', 'Metadata'] - except: - self.nir = None - - # If WhoIs search has been successful, assigns asn result as an attribute - try: - self.asn_registry = self.all_results.loc['asn_registry', 'Metadata'] - except: - self.asn_registry = None - - # If WhoIs search has been successful, assigns asn result as an attribute - try: - self.asn = self.all_results.loc['asn', 'Metadata'] - except: - self.asn = None - - # If WhoIs search has been successful, assigns asn_cidr result as an attribute - try: - self.asn_cidr = self.all_results.loc['asn_cidr', 'Metadata'] - except: - self.asn_cidr = None - - # If WhoIs search has been successful, assigns asn country code result as an attribute - try: - self.asn_country_code = self.all_results.loc['asn_country_code', 'Metadata'] - except: - self.asn_country_code = None - - # If WhoIs search has been successful, assigns asn date country code result as an attribute - try: - self.asn_date = self.all_results.loc['asn_date', 'Metadata'] - except: - self.asn_date = None - - # If WhoIs search has been successful, assigns asn description code result as an attribute - try: - self.asn_description = self.all_results.loc['asn_description', 'Metadata'] - except: - self.asn_description = None - - # If WhoIs search has been successful, assigns query result as an attribute - try: - self.query = self.all_results.loc['query', 'Metadata'] - except: - self.query = None - - # If WhoIs search has been 
successful, assigns entities result as an attribute - try: - self.entities = self.all_results.loc['entities', 'Metadata'] - except: - self.entities = None - - # If WhoIs search has been successful, assigns raw result as an attribute - try: - self.raw = self.all_results.loc['raw', 'Metadata'] - except: - self.raw = None - - # If WhoIs search has been successful, assigns network result as an attribute - try: - self.network = pd.DataFrame(pd.Series(self.all_results.loc['network', 'Metadata']), columns = ['Metadata']) - self.network.index.name = 'Category' - except: - self.network = None - - # If WhoIs search has been successful, parses network events result and assigns as an attribute - try: - - self.network_events = pd.DataFrame(columns = ['action', 'timestamp', 'actor']) - events = self.network.loc['events', 'Metadata'] - for i in events: - df = pd.DataFrame(pd.Series(i)).T - self.network_events = pd.concat([self.network_events, df]) - - self.network_events = self.network_events.reset_index().drop('index', axis=1) - self.all_results.at['network_events', 'Metadata'] = events - - except: - self.network_events = None - - # If WhoIs search has been successful, parses network notices result and assigns as an attribute - try: - - self.network_notices = pd.DataFrame(columns = ['title', 'description', 'links']) - notices = self.network.loc['notices', 'Metadata'] - - for i in notices: - df = pd.DataFrame(pd.Series(i)).T - self.network_notices = pd.concat([self.network_notices, df]) - - self.network_notices = self.network_notices.reset_index().drop('index', axis=1) - self.all_results.at['network_notices', 'Metadata'] = notices - - except: - self.network_notices = None - - # If WhoIs search has been successful, parses objects result and assigns as an attribute - try: - objects_series = pd.Series(self.all_results.loc['objects', 'Metadata']) - self.objects = pd.DataFrame(dtype = object) - - for i in objects_series: - obj_df = pd.DataFrame(pd.Series(i)).T - self.objects = pd.concat([self.objects, obj_df]).reset_index().drop('index', axis=1) - - except: - self.objects = None - - # If WhoIs search has been successful, parses contacts result and assigns as an attribute - try: - contacts_series = self.objects['contact'] - - self.contacts = pd.DataFrame(dtype = object) - - for i in contacts_series: - contact_df = pd.DataFrame(pd.Series(i)).T - try: - contact_df.loc[0, 'address'] = contact_df.loc[0, 'address'][0]['value'] - except: - None - - try: - contact_df.loc[0, 'phone'] = contact_df.loc[0, 'phone'][0]['value'] - except: - None - - try: - contact_df.loc[0, 'email'] = contact_df.loc[0, 'email'][0]['value'] - except: - None - - self.contacts = pd.concat([self.contacts, contact_df]).reset_index().drop('index', axis=1) - self.all_results.at['contacts', 'Metadata'] = contacts_series.to_list() - except: - self.contacts = None - - # Cleaning up results dataframe - self.all_results = self.all_results.replace(np.nan, None) - self.all_results = self.all_results.reset_index().drop_duplicates(subset = 'Category').set_index('Category') - - - def domain_whois(self, domain: str, ip_address: str): - - """ - Performs a WhoIs lookup on a domain. 
- """ - - # Checking if domain address given is valid; trying to retrieve domain if not - if domain != None: - if is_domain(domain) == False: - try: - domain = domain_splitter(domain) - except: - pass - - if is_domain(domain) == False: - domain = None - - # Checking if IP address given is valid - if ip_address != None: - if is_ip_address(ip_address) == False: - ip_address = None - - # If only an IP address has been given, tries to retrieve associated domain address - if (domain == None) and (ip_address != None): - try: - domain = domain_from_ip(ip_address)['domain_name'] - except: - domain = None - return - - # Using recursion, runs a WhoIs lookup on the domain that is found - domain_whois(self, domain = domain, ip_address = ip_address) - - # Running WhoIs lookup on the domain given - if domain != None: - - # Trying to run WhoIs search on domain - try: - # Creating dataframe for output and inputting WhoIs result - df = pd.DataFrame(pd.Series(whois.whois(domain)), columns = ['Metadata']) - df.index.name = 'Category' - self.all_results = pd.concat([self.all_results, df]) - self.all_results.loc['domain', 'Metadata'] = domain - - except: - pass - - # Trying to retrieve associated IP address if none given - try: - if ip_address == None: - try: - ip_address = ip_from_domain(domain) - except: - ip_address = None - - self.ip_address = ip_address - self.all_results.loc['ip_address', 'Metadata'] = ip_address - - else: - pass - - except: - pass - - - def ip_whois(self, domain: str, ip_address: str): - - """ - Performs a WhoIs lookup on an IP address. - """ - - # Checking if domain address given is valid; trying to retrieve domain if not - if domain != None: - if is_domain(domain) == False: - try: - domain = domain_splitter(domain) - except: - pass - - if is_domain(domain) == False: - domain = None - - # Checking if IP address given is valid - if ip_address != None: - - ip_address = ip_address.strip().strip('/').strip() - if is_ip_address(ip_address) == False: - ip_address = None - - # If only a domain has been given, tries to retrieve associated IP address - if (domain != None) and (ip_address == None): - try: - ip_address = ip_from_domain(domain) - self.ip_address = ip_address - - # Using recursion, runs a WhoIs lookup on the IP address that is found - ip_whois(self, domain = domain, ip_address = ip_address) - except: - pass - - # Running WhoIs lookup on the IP address given - if ip_address != None: - - # Retrieving domain if none already given/found - if domain == None: - try: - domain = domain_from_ip(ip_address)['domain_name'] - except: - domain = None - self.domain = domain - - # Running domain WhoIs on domain - try: - domain_whois(self, domain = domain, ip_address = ip_address) - except: - pass - - # Formatting results as pandas.DataFrame - try: - domain_reverse_search = pd.DataFrame(domain_from_ip(ip_address), columns = ['Metadata']) - domain_reverse_search.index.name = 'Category' - self.all_results = pd.concat([self.all_results, domain_reverse_search]) - except: - pass - - # Running WhoIs lookup on IP address - try: - self.IPWhois_obj = IPWhois(ip_address) - except: - pass - - # Retrieving geocode associated with IP address using Geopy - try: - self.ip_geocode = get_ip_geocode(ip_address) - self.all_results.at['ip_geocode', 'Metadata'] = self.ip_geocode - except: - pass - - # Retrieving coordinates associated with IP address using Geopy - try: - self.ip_coordinates = get_ip_coordinates(ip_address) - self.all_results.at['ip_coordinates', 'Metadata'] = self.ip_coordinates - except: - pass - - # 
Retrieving location associated with IP address using Geopy - try: - self.ip_location = get_ip_physical_location(ip_address) - self.all_results.at['ip_location', 'Metadata'] = self.ip_location - except: - pass - - # Assigning results to WhoisResult object - try: - self.all_results.at['ip_address', 'Metadata'] = ip_address - except: - None - - else: - self.IPWhois_obj = None - - - - def __repr__(self): - - """Controls how WhoIsResult objects are represented in string form.""" - - return str(self.all_results) - - def contents(self): - - """Returns a list of object contents.""" - - return list(self.__dict__.keys()) - - def copy(self): - - """Creates a copy of object.""" - - return copy.deepcopy(self) - -def domain_whois(domain: str = 'request_input'): - - """ - Performs a WhoIs lookup on a domain address. - """ - - # Requesting domain address from user input if none given - if domain == 'request_input': - domain = input('Domain: ') - - # Raising error if domain given is not a string - if type(domain) != str: - raise TypeError('Domain must be a string') - - # Checking if domain is valid; if not, tries to extract domain - if is_domain(domain) == False: - domain = domain_splitter(domain) - - # Re-checking if domain is valid; if true, creates WhoisResult object - if is_domain(domain) == True: - return WhoisResult(domain = domain) - - else: - return None - -def domains_whois(domains_list: List[str]): - - """ - Performs a WhoIs lookup on a list or set of domain addresses. - """ - - # Creating output dataframe - output_df = pd.DataFrame(dtype=object) - - # Iterating through domains - for domain in domains_list: - - if domain == None: - df = pd.DataFrame() - - else: - # If domain is valid, running WhoIs lookup using WhoisResult class - if is_domain(domain) == True: - try: - df = domain_whois(domain = domain).all_results.T - except: - continue - else: - # If domain is not valid, trying to retrieve domain - try: - domain = domain_splitter(domain) - except: - continue - - # If domain is valid, running WhoIs lookup using WhoisResult class - if is_domain(domain) == True: - try: - df = domain_whois(domain = domain).all_results.T - except: - continue - - # Concatanating results dataframes - output_df = pd.concat([output_df, df]) - - # Cleaning and reformatting output dataframe - output_df = output_df.replace(np.nan, None).reset_index().drop('index', axis=1) - output_df = output_df.set_index('domain') - output_df.index.rename('domain') - output_df.columns.name = None - - return output_df - - -def ip_whois(ip_address: str = 'request_input'): - - """ - Performs a WhoIs lookup on an IP address. - """ - - # Requesting domain address from user input if none given - if ip_address == 'request_input': - ip_address = input('IP address: ') - - # Cleaning IP address - ip_address = ip_address.strip().strip('/').strip() - - # Checking if IP address is valid; if true, running WhoIs lookup using WhoisResult class - if is_ip_address(ip_address) == True: - return WhoisResult(ip_address = ip_address) - - else: - return None - -def ips_whois(ip_addresses = List[str]): - - """ - Performs a WhoIs lookup on a list of IP addresses. 
- """ - - # Creating output dataframe - output_df = pd.DataFrame(dtype=object) - - # Iterating through IP addresses - for ip in ip_addresses: - - if ip == None: - df = pd.DataFrame() - - # If IP address is inputted, tries to run a WhoIs lookup using WhoisResult class - else: - - try: - df = ip_whois(ip_address = ip).all_results.T - except: - df = pd.DataFrame() - - # Concatanating results dataframes - output_df = pd.concat([df, output_df]) - - # Cleaning and reformatting output dataframe - output_df = output_df.replace(np.nan, None).reset_index().drop('index', axis=1) - output_df = output_df.set_index('ip_address') - output_df.index.rename('ip_address') - output_df.columns.name = None - - return output_df - - -def lookup_whois(string: str = 'request_input'): - - """ - Performs a WhoIs lookup on an inputted domain or IP address. - """ - - # Requesting string from user input if none given - if string == 'request_input': - string = input('Search query: ') - - # Raising error if input is not a string object - if type(string) != str: - raise TypeError('Search query must be a string') - - # Making a copy of string - original_string = copy.deepcopy(string) - - # Cleaning string - string = string.strip().strip('/').strip('\\').strip('#').strip('.').strip('-').strip() - - # If string is an IP address, runs IP WhoIs lookup - if is_ip_address(string) == True: - try: - return ip_whois(string) - except: - return - - # If string is not an IP address and is not a valid domain, tries to retrieve domain - if is_domain(string) == False: - string = domain_splitter(string) - - # If string is a domain, runs domain WhoIs lookup - if is_domain(string) == True: - try: - return domain_whois(domain = string) - except: - return - - else: - raise ValueError(f'Search query "{original_string}" contains neither a valid domain name nor a valid IP address') - - - - ## Functions for opening links and addresses def open_url(url = 'request_input'): @@ -968,6 +184,10 @@ def open_url(url = 'request_input'): def open_doi(doi = 'request_input'): + """ + Opens DOI in the default web browser. + """ + if doi == 'request_input': doi = input('DOI: ') diff --git a/art/networks/__pycache__/__init__.cpython-39.pyc b/art/networks/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..510c1ac Binary files /dev/null and b/art/networks/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/networks/__pycache__/network_functions.cpython-39.pyc b/art/networks/__pycache__/network_functions.cpython-39.pyc new file mode 100644 index 0000000..a48d3ce Binary files /dev/null and b/art/networks/__pycache__/network_functions.cpython-39.pyc differ diff --git a/art/networks/network_functions.py b/art/networks/network_functions.py index 757ab31..4b3bf43 100644 --- a/art/networks/network_functions.py +++ b/art/networks/network_functions.py @@ -627,7 +627,15 @@ def generate_funder_works_network(funder_works_dict: dict) -> Graph: return g def cocitation_dict(citation_network) -> dict: - + + """ + Generates a dictionary representing co-citations from a citation network. + + Notes + ----- + Is able to take igraph.Graph, Network, and NetworkX objects. 
+ """ + # Converting NetworkX objects to igraph objects if ( (type(citation_network) == NetworkX_Undir) diff --git a/art/text/__pycache__/__init__.cpython-39.pyc b/art/text/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..fd5d2ed Binary files /dev/null and b/art/text/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/text/__pycache__/textanalysis.cpython-39.pyc b/art/text/__pycache__/textanalysis.cpython-39.pyc new file mode 100644 index 0000000..e8dfb9e Binary files /dev/null and b/art/text/__pycache__/textanalysis.cpython-39.pyc differ diff --git a/art/utils/__pycache__/__init__.cpython-39.pyc b/art/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..3f57d2e Binary files /dev/null and b/art/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/art/utils/__pycache__/basics.cpython-39.pyc b/art/utils/__pycache__/basics.cpython-39.pyc new file mode 100644 index 0000000..3072760 Binary files /dev/null and b/art/utils/__pycache__/basics.cpython-39.pyc differ diff --git a/art/utils/__pycache__/cleaners.cpython-39.pyc b/art/utils/__pycache__/cleaners.cpython-39.pyc new file mode 100644 index 0000000..8cbe2af Binary files /dev/null and b/art/utils/__pycache__/cleaners.cpython-39.pyc differ diff --git a/art/utils/basics.py b/art/utils/basics.py index 3a344c8..e0a2388 100644 --- a/art/utils/basics.py +++ b/art/utils/basics.py @@ -5,6 +5,7 @@ from os import stat_result import os import sys +import pickle results_cols = [ 'work_id', @@ -50,9 +51,19 @@ def blockPrint(): + + """ + Blocks command line interface from printing text via print() and similar functions. Follow with enablePrint() to allow printing. + """ + sys.stdout = open(os.devnull, 'w') def enablePrint(): + + """ + Enables command line interface to print text via print() and similar functions. Must be used after blockPrint(). + """ + sys.stdout = sys.__stdout__ class Iterator: @@ -99,7 +110,6 @@ def __next__(self): raise StopIteration - def dict_to_str(item: dict) -> str: """ @@ -259,4 +269,19 @@ def stat_file_to_dict(self): return dictionary +def open_file(file_address: str = 'request_input'): # type: ignore + + """ + Reads saved ART files and returns as a Review object. Reviews must be formatted as .review or pickled .txt files. + """ + + if file_address == 'request_input': + file_address = input('File address: ') + + if (file_address.endswith('.txt')) or (file_address.endswith('.review')): + with open(file_address, 'rb') as f: + review = pickle.load(f) + + return review + stat_result.to_dict = stat_file_to_dict # type: ignore \ No newline at end of file diff --git a/art/utils/cleaners.py b/art/utils/cleaners.py index a171f23..5384aa8 100644 --- a/art/utils/cleaners.py +++ b/art/utils/cleaners.py @@ -1,7 +1,7 @@ """Functions for parsing, cleaning, and normalising data.""" # Importing packages - +from .basics import blockPrint, enablePrint from ..datasets import stopwords, html_stopwords from typing import List, Dict, Tuple @@ -17,9 +17,15 @@ import nltk from nltk.tokenize import word_tokenize, sent_tokenize # type: ignore +blockPrint() nltk.download('punkt') +enablePrint() def join_list_by_colon(item): + + """ + Takes list and returns as a string separated by a comma followed by a space. Used as input for Pandas.apply. + """ if type(item) == list: return ', '.join(item) @@ -29,6 +35,10 @@ def join_list_by_colon(item): def join_df_col_lists_by_colon(dataframe): + """ + Takes a Pandas DataFrame and converts lists to strings separated by a comma followed by a space. 
+ """ + for col in dataframe.columns: dataframe[col] = dataframe[col].apply(join_list_by_colon) @@ -36,6 +46,10 @@ def join_df_col_lists_by_colon(dataframe): def split_str_by_colon(item): + """ + Splits a string into a list by semi-colons. Used as input for Pandas.apply. + """ + if type(item) == str: return item.split(',') @@ -1283,6 +1297,22 @@ def correct_series_of_lists(series: pd.Series) -> pd.Series: def merge_duplicate_ids(dataframe, merge_on: str): + """ + Takes a DataFrame and merges rows with duplicate IDs. + + Parameters + ---------- + dataframe : Results, References or pandas.DataFrame + dataframe to process. + merge_on : str + name of column containing IDs to merge on. + + Returns + ------- + dataframe : Results, References or pandas.DataFrame + processed DataFrame. + """ + df = dataframe.copy(deep=True) if merge_on in df.columns: @@ -1377,10 +1407,26 @@ def merge_duplicate_ids(dataframe, merge_on: str): def merge_all_duplicate_ids(dataframe): - id_names = ['doi', 'isbn', 'issn', 'uri', 'orcid', 'crossref_id', 'crossref', 'scopus_id', 'scopus', 'wos_id', 'wos', 'pubmed_id', 'address', 'link', 'website'] - df = dataframe.copy(deep=True) + """ + Takes a DataFrame and merges rows with duplicate bibliometric IDs. + Parameters + ---------- + dataframe : Results, References or pandas.DataFrame + dataframe to process. + + Returns + ------- + dataframe : Results, References or pandas.DataFrame + processed DataFrame. + + Notes + ----- + Bibliometric identifiers used to check for duplicate records: DOI, ISBN, ISSN, URI, ORCID, CrossRef ID, Scopus, Web of Science, PubMed, address, URL, website. + """ + id_names = ['doi', 'isbn', 'issn', 'uri', 'orcid', 'crossref_id', 'crossref', 'scopus_id', 'scopus', 'wos_id', 'wos', 'pubmed_id', 'address', 'link', 'website'] + df = dataframe.copy(deep=True) for i in id_names: if i in dataframe.columns: @@ -1390,6 +1436,10 @@ def merge_all_duplicate_ids(dataframe): def deduplicate(dataframe): + """ + Deduplicates custom ART DataFrames (Results, References, Authors.summary, Funders.summary, Affiliations.summary) using unique identifiers. + """ + ignore_cols = ['work_id', 'author_id', 'funder_id', diff --git a/pyproject.toml b/pyproject.toml index 79889dd..171406b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "academic_review_tool" -version = "1.0.9" +version = "1.1.0" description = "The Academic Review Tool (ART) is a package for performing academic reviews and bibliometric analyses in Python. It offers capabilities for discovering, retrieving, and analysing academic literature at scale." readme = "README.md" authors = [