Skip to content

Commit

Permalink
Merge pull request #110 from pepkit/dev
Browse files Browse the repository at this point in the history
V0.12.0
  • Loading branch information
khoroshevskyi authored Jan 23, 2023
2 parents 13f54d7 + 4ff185e commit e38a70d
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 10 deletions.
3 changes: 3 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## [0.12.0] -- 2023-01-23
- Added functionality that saves gse metadata to config file

## [0.11.2] -- 2022-12-25
- Changed sample_name of PEP of processed files to file oriented
- Added `--max-soft-size` argument, that sets size limit of soft files
Expand Down
2 changes: 1 addition & 1 deletion geofetch/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.11.2"
__version__ = "0.12.0"
2 changes: 2 additions & 0 deletions geofetch/config_processed_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ pep_version: 2.1.0
project_name: {project_name}
sample_table: {sample_table}

{project_metadata}

sample_modifiers:
append:
output_file_path: FILES
Expand Down
2 changes: 2 additions & 0 deletions geofetch/config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
name: {project_name}
pep_version: 2.1.0
sample_table: {annotation}

{project_metadata}
{subannotation}

{sample_modifier_str}
Expand Down
72 changes: 64 additions & 8 deletions geofetch/geofetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
_separate_file_url,
_filter_gsm,
_unify_list_keys,
gse_content_to_dict,
)

from rich.progress import track
Expand Down Expand Up @@ -416,6 +417,8 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje
file_gse_content = gse_file_obj.read().split("\n")
file_gse_content = [elem for elem in file_gse_content if len(elem) > 0]

file_gse_content_dict = gse_content_to_dict(file_gse_content)

if not os.path.isfile(file_gsm) or self.refresh_metadata:
file_gsm_content = Accession(acc_GSE).fetch_metadata(
file_gsm,
Expand Down Expand Up @@ -453,7 +456,10 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje
# generating PEPs for processed files:
if self.acc_anno:
self._generate_processed_meta(
acc_GSE, meta_processed_samples, meta_processed_series
acc_GSE,
meta_processed_samples,
meta_processed_series,
gse_meta_dict=file_gse_content_dict,
)

else:
Expand Down Expand Up @@ -498,6 +504,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje
name=acc_GSE,
metadata_dict=gsm_metadata,
subannot_dict=gsm_multi_table,
gse_meta_dict=file_gse_content_dict,
)

else:
Expand All @@ -520,6 +527,9 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje
name=self.project_name,
meta_processed_samples=processed_metadata_samples,
meta_processed_series=processed_metadata_series,
gse_meta_dict=file_gse_content_dict
if len(acc_GSE_list.keys()) == 1
else None,
)
if self.just_object:
return return_value
Expand All @@ -530,6 +540,9 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje
f"{self.project_name}_PEP",
metadata_dict_combined,
subannotation_dict_combined,
gse_meta_dict=file_gse_content_dict
if len(acc_GSE_list.keys()) == 1
else None,
)
if self.just_object:
return return_value
Expand Down Expand Up @@ -706,14 +719,20 @@ def fetch_processed_one(
return meta_processed_samples, meta_processed_series

def _generate_processed_meta(
self, name: str, meta_processed_samples: list, meta_processed_series: list
self,
name: str,
meta_processed_samples: list,
meta_processed_series: list,
gse_meta_dict: Union[dict, None] = None,
) -> dict:
"""
Generate and save PEPs for processed accessions. GEO has data in GSE and GSM,
conditions are used to decide which PEPs will be saved.
:param name: name of the folder/file where PEP will be saved
:param meta_processed_samples:
:param meta_processed_series:
:param gse_meta_dict: dict of metadata fetched from one experiment.
Used to add this data to config file.
:return: dict of objects if just_object is set, otherwise dicts of None
"""
return_objects = {f"{name}_samples": None, f"{name}_series": None}
Expand All @@ -729,6 +748,7 @@ def _generate_processed_meta(
meta_processed_samples,
pep_acc_path_sample,
just_object=self.just_object,
gse_meta_dict=gse_meta_dict,
)

# series
Expand All @@ -753,6 +773,7 @@ def _generate_processed_meta(
meta_processed_samples,
pep_acc_path_sample,
just_object=self.just_object,
gse_meta_dict=gse_meta_dict,
)
elif self.supp_by == "series":
return_objects[f"{name}_series"] = pep_acc_path_exp = os.path.join(
Expand Down Expand Up @@ -957,12 +978,15 @@ def _write_processed_annotation(
processed_metadata: list,
file_annotation_path: str,
just_object: bool = False,
gse_meta_dict: dict = None,
) -> Union[NoReturn, peppy.Project]:
"""
Save annotation file by providing list of dictionaries with files metadata
:param list processed_metadata: list of dictionaries with files metadata
:param str file_annotation_path: the path to the metadata file that has to be saved
:type just_object: True, if you want to get peppy object without saving file
:param just_object: True, if you want to get peppy object without saving file
:param gse_meta_dict: dict of metadata fetched from one experiment.
Used to add this data to config file.
:return: none, or peppy project
"""
if len(processed_metadata) == 0:
Expand Down Expand Up @@ -991,7 +1015,9 @@ def _write_processed_annotation(
self.attr_limit_truncate,
)

template = self._create_config_processed(file_annotation_path, proj_meta)
template = self._create_config_processed(
file_annotation_path, proj_meta, meta_in_series=gse_meta_dict
)

if not just_object:
with open(file_annotation_path, "w") as m_file:
Expand Down Expand Up @@ -1044,14 +1070,19 @@ def _find_genome(metadata_list: list) -> list:
return metadata_list

def _write_raw_annotation_new(
self, name, metadata_dict: dict, subannot_dict: dict = None
self,
name,
metadata_dict: dict,
subannot_dict: dict = None,
gse_meta_dict: dict = None,
) -> Union[None, peppy.Project]:
"""
Combine individual accessions into project-level annotations, and write
individual accession files (if requested)
:param name: Name of the run, project, or acc --> will influence name of the folder where project will be created
:param metadata_dict: dictionary of sample annotations
:param subannot_dict: dictionary of subsample annotations
:param gse_meta_dict: dict of experiment metadata that was stored in gse
:return: none or peppy object
"""
try:
Expand Down Expand Up @@ -1101,7 +1132,7 @@ def _write_raw_annotation_new(
subanot_path_yaml = f""

template = self._create_config_raw(
proj_meta, proj_root_sample, subanot_path_yaml
proj_meta, proj_root_sample, subanot_path_yaml, gse_meta_dict
)

if not self.just_object:
Expand Down Expand Up @@ -1137,12 +1168,16 @@ def _write_raw_annotation_new(
return proj

def _create_config_processed(
self, file_annotation_path: str, proj_meta: list
self,
file_annotation_path: str,
proj_meta: list,
meta_in_series: dict = True,
) -> str:
"""
Compose and generate config file content
:param file_annotation_path: root to the annotation file
:param proj_meta: common metadata that has to added to config file
:param meta_in_series: dict of GSE experiment metadata to embed in the config file (presumably from gse_content_to_dict; nothing is added when falsy)
:return: generated, complete config file content
"""
geofetchdir = os.path.dirname(__file__)
Expand All @@ -1154,25 +1189,37 @@ def _create_config_processed(
for i in proj_meta
]
modifiers_str = "\n ".join(d for d in meta_list_str)

# series metadata
if not meta_in_series:
project_metadata = ""
else:
meta_list_str = {i: j for i, j in meta_in_series.items()}
project_metadata = yaml.dump(meta_list_str, default_style='"')

template_values = {
"project_name": self.project_name,
"sample_table": os.path.basename(file_annotation_path),
"geo_folder": self.geo_folder,
"pipeline_samples": self.file_pipeline_samples,
"pipeline_project": self.file_pipeline_project,
"additional_columns": modifiers_str,
"project_metadata": project_metadata,
}
for k, v in template_values.items():
placeholder = "{" + str(k) + "}"
template = template.replace(placeholder, str(v))
return template

def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml):
def _create_config_raw(
self, proj_meta, proj_root_sample, subanot_path_yaml, meta_in_series=None
):
"""
Compose and generate config file content for raw data
:param proj_meta: root to the annotation file
:param proj_root_sample: path to sampletable file
:param subanot_path_yaml: path to subannotation file
:param meta_in_series:
:return: generated, complete config file content
"""
meta_list_str = [
Expand All @@ -1195,6 +1242,14 @@ def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml):
sra_convert_template = template_file.read()
else:
sra_convert_template = ""

# series metadata
if not meta_in_series:
project_metadata = ""
else:
meta_list_str = {i: j for i, j in meta_in_series.items()}
project_metadata = yaml.dump(meta_list_str, default_style='"')

with open(self.config_template, "r") as template_file:
template = template_file.read()
template_values = {
Expand All @@ -1206,6 +1261,7 @@ def _create_config_raw(self, proj_meta, proj_root_sample, subanot_path_yaml):
"pipeline_project": self.file_pipeline_project,
"additional_columns": modifiers_str,
"sra_convert": sra_convert_template,
"project_metadata": project_metadata,
}
for k, v in template_values.items():
placeholder = "{" + str(k) + "}"
Expand Down
22 changes: 22 additions & 0 deletions geofetch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,3 +722,25 @@ def _unify_list_keys(processed_meta_list: list) -> list:
if k not in processed_meta_list[list_elem]:
processed_meta_list[list_elem][k] = ""
return processed_meta_list


def gse_content_to_dict(gse_content: List[str]) -> Dict[str, dict]:
    """
    Unpack the lines of a GSE SOFT file into a metadata dictionary.

    Only attribute lines (those starting with "!") are parsed; entity
    lines (starting with "^") and any other content are ignored. When the
    same key occurs on multiple lines (e.g. multi-line summaries), the
    values are concatenated with " + ".

    :param gse_content: list of lines of a GSE SOFT file
    :return: {"experiment_metadata": {sanitized_key: sanitized_value, ...}}
    """
    gse_dict = {}
    for line in gse_content:
        if not line.startswith("!"):
            # "^" entity lines and any other text carry no attribute data
            continue
        # Split only on the FIRST " = " so a value that itself contains
        # " = " is preserved verbatim (the old unlimited split re-joined
        # the pieces with a single space, corrupting such values).
        key_value = line.split(" = ", 1)
        new_key = _sanitize_name(key_value[0][1:])
        new_value = _sanitize_config_string(key_value[1] if len(key_value) > 1 else "")
        if new_key in gse_dict:
            # repeated key: append rather than overwrite
            gse_dict[new_key] = f"{gse_dict[new_key]} + {new_value}"
        else:
            gse_dict[new_key] = new_value

    return {"experiment_metadata": gse_dict}
2 changes: 1 addition & 1 deletion requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ logmuse>=0.2.6
ubiquerg>=0.6.2
requests>=2.28.1
xmltodict>=0.13.0
pandas>=1.3.5
pandas>=1.5.3
peppy>=0.35.3
rich>=12.5.1
coloredlogs>=15.0.1

0 comments on commit e38a70d

Please sign in to comment.