-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent 7f0db0f, commit 7f7e89b
Showing 2 changed files with 182 additions and 96 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,89 +1,129 @@ | ||
#!/bin/bash | ||
#SBATCH -c 64 | ||
#SBATCH -t 0 | ||
#SBATCH -p <insert partition here> | ||
#SBATCH -p eddy | ||
#SBATCH --mem=64000 | ||
#SBATCH -o logs/rna3db_full_release_%j.out | ||
#SBATCH -e logs/rna3db_full_release_%j.err | ||
#SBATCH --mail-user=<insert email address here> | ||
#SBATCH --mail-user=marcellszikszai@fas.harvard.edu | ||
#SBATCH --mail-type=ALL | ||
|
||
# where you want the release to be output to | ||
OUTPUT_DIR="" | ||
OUTPUT_DIR="/n/eddy_lab/users/mszikszai/rna3db/data/test_auto_release/output" | ||
|
||
# you set these once and forget | ||
RNA3DB_ROOT_DIR="" | ||
MMCIF_DIR="" | ||
CMSCAN="" | ||
CMDB="" | ||
RNA3DB_ROOT_DIR="/n/eddy_lab/users/mszikszai/rna3db/" | ||
PDB_MMCIF_DIR="/n/eddy_lab/users/mszikszai/rna3db/data/test_auto_release/old" | ||
CMSCAN="/n/eddy_lab/users/mszikszai/infernal/binaries/cmscan" | ||
CMDB="/n/eddy_lab/users/mszikszai/rna3db-scans/rfams/14.10/Rfam.cm" | ||
|
||
# set useful constants | ||
NEW_RELEASE_DATE=$(date +"%Y-%m-%d") | ||
mkdir -p $OUTPUT_DIR/$NEW_RELEASE_DATE | ||
JSON_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons | ||
MMCIF_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-mmcifs | ||
CMSCAN_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans | ||
FASTA_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-fastas | ||
UPLOAD_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/uploads | ||
|
||
# make directories | ||
mkdir -p $JSON_PATH | ||
mkdir -p $MMCIF_PATH | ||
mkdir -p $CMSCAN_PATH | ||
mkdir -p $FASTA_PATH | ||
mkdir -p $UPLOAD_PATH | ||
|
||
# prepare the env | ||
mamba activate rna3db | ||
|
||
# download latest mmCIF files | ||
bash $RNA3DB_ROOT_DIR/scripts/download_pdb_mmcif.sh $MMCIF_DIR | ||
echo "Downloading..." | ||
#bash $RNA3DB_ROOT_DIR/scripts/download_pdb_mmcif.sh $PDB_MMCIF_DIR | ||
echo "Finished downloading." | ||
|
||
# run parse | ||
echo "Parsing..." | ||
python -m rna3db parse \ | ||
$MMCIF_DIR \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/parse.json | ||
$PDB_MMCIF_DIR \ | ||
$JSON_PATH/parse.json | ||
echo "Finished parsing." | ||
|
||
# run filter | ||
python -m rna3db filter $MMCIF_DIR $OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/filter.json | ||
echo "Filtering..." | ||
python -m rna3db filter \ | ||
$JSON_PATH/parse.json \ | ||
$JSON_PATH/filter.json | ||
echo "Finished filtering." | ||
|
||
# write all sequences to a FASTA file | ||
echo "Building FASTA..." | ||
python $RNA3DB_ROOT_DIR/scripts/json_to_fasta.py \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/parse.json \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.fasta | ||
$JSON_PATH/parse.json \ | ||
$FASTA_PATH/$NEW_RELEASE_DATE.fasta | ||
echo "Finished building FASTA." | ||
|
||
# do cmscan on all new sequences | ||
CMSCAN --cpu 64 \ | ||
-o $OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.o \ | ||
-tbl $OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.tbl \ | ||
echo "Running cmscan..." | ||
$CMSCAN --cpu 64 \ | ||
-o $CMSCAN_PATH/$NEW_RELEASE_DATE.o \ | ||
--tbl $CMSCAN_PATH/$NEW_RELEASE_DATE.tbl \ | ||
$CMDB \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.fasta | ||
$FASTA_PATH/$NEW_RELEASE_DATE.fasta | ||
echo "Finished cmscan." | ||
|
||
# find all sequences that did not get a hit | ||
echo "Finding sequences without hits..." | ||
python $RNA3DB_ROOT_DIR/scripts/get_nohits.py \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.fasta \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE-nohits.fasta \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/ | ||
$FASTA_PATH/$NEW_RELEASE_DATE.fasta \ | ||
$FASTA_PATH/$NEW_RELEASE_DATE-nohits.fasta \ | ||
$CMSCAN_PATH | ||
echo "Finished finding sequences." | ||
|
||
# re-scan all sequences with --max that did not get a hit | ||
CMSCAN --max --cpu 64 \ | ||
-o $OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE-nohits.o \ | ||
-tbl $OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE-nohits.tbl \ | ||
echo "Running cmscan (no hits)..." | ||
$CMSCAN --max --cpu 64 \ | ||
-o $CMSCAN_PATH/$NEW_RELEASE_DATE-nohits.o \ | ||
--tbl $CMSCAN_PATH/$NEW_RELEASE_DATE-nohits.tbl \ | ||
$CMDB \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.fasta | ||
$FASTA_PATH/$NEW_RELEASE_DATE-nohits.fasta | ||
echo "Finished cmscan (no hits)." | ||
|
||
# run cluster | ||
echo "Clustering..." | ||
python -m rna3db cluster \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/filter.json \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/cluster.json \ | ||
--tbl_dir $OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans | ||
$JSON_PATH/filter.json \ | ||
$JSON_PATH/cluster.json \ | ||
--tbl_dir $CMSCAN_PATH | ||
echo "Cleaning up..." | ||
rm -r $JSON_PATH/mmseqs2_* | ||
echo "Finished clustering." | ||
|
||
# run split | ||
echo "Splitting..." | ||
python -m rna3db split \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/cluster.json \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/split.json | ||
$JSON_PATH/cluster.json \ | ||
$JSON_PATH/split.json | ||
echo "Finished splitting." | ||
|
||
# make mmCIFs | ||
echo "Making mmCIFs..." | ||
python scripts/json_to_mmcif.py \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/split.json \ | ||
/Users/marcell/Documents/rna3db/data/pdb_mmcif/ \ | ||
$MMCIF_DIR \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-mmcifs | ||
$JSON_PATH/split.json \ | ||
$PDB_MMCIF_DIR \ | ||
$MMCIF_PATH | ||
echo "Finished writing mmCIFs." | ||
|
||
# compress files ready for release | ||
tar -czvf \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans.tar.gz \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans | ||
tar -czvf \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans.tar.gz \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans | ||
tar -cfvJ \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-mmcifs.tar.xz \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-mmcifs | ||
echo "Compressing..." | ||
tar -czf \ | ||
$UPLOAD_PATH/rna3db-cmscans.tar.gz \ | ||
$CMSCAN_PATH | ||
tar -czf \ | ||
$UPLOAD_PATH/rna3db-jsons.tar.gz \ | ||
$JSON_PATH | ||
tar -cJf \ | ||
$UPLOAD_PATH/rna3db-mmcifs.tar.xz \ | ||
$MMCIF_PATH | ||
|
||
echo "Finished." | ||
echo $NEW_RELEASE_DATE "release ready for upload." | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,91 +1,137 @@ | ||
#!/bin/bash | ||
#SBATCH -c 64 | ||
#SBATCH -t 0 | ||
#SBATCH -p <insert partition here> | ||
#SBATCH -p eddy | ||
#SBATCH --mem=64000 | ||
#SBATCH -o logs/rna3db_full_release_%j.out | ||
#SBATCH -e logs/rna3db_full_release_%j.err | ||
#SBATCH --mail-user=<insert email address here> | ||
#SBATCH -o logs/rna3db_incremental_release_%j.out | ||
#SBATCH -e logs/rna3db_incremental_release_%j.err | ||
#SBATCH --mail-user=marcellszikszai@fas.harvard.edu | ||
#SBATCH --mail-type=ALL | ||
|
||
# the output dir, along with the date of the last release | ||
OUTPUT_DIR="" | ||
PREVIOUS_RELEASE_DATE="2024-04-26" | ||
# where you want the release to be output to | ||
OUTPUT_DIR="/n/eddy_lab/users/mszikszai/rna3db/data/test_auto_release/output" | ||
# set where the latest release is located | ||
OLD_RELEASE=$OUTPUT_DIR/2024-05-14 | ||
|
||
# you set these once and forget | ||
RNA3DB_ROOT_DIR="" | ||
MMCIF_DIR="" | ||
CMSCAN="" | ||
CMDB="" | ||
RNA3DB_ROOT_DIR="/n/eddy_lab/users/mszikszai/rna3db/" | ||
PDB_MMCIF_DIR="/n/eddy_lab/users/mszikszai/rna3db/data/test_auto_release/new" | ||
CMSCAN="/n/eddy_lab/users/mszikszai/infernal/binaries/cmscan" | ||
CMDB="/n/eddy_lab/users/mszikszai/rna3db-scans/rfams/14.10/Rfam.cm" | ||
|
||
# set useful constants | ||
NEW_RELEASE_DATE=$(date +"%Y-%m-%d") | ||
mkdir -p $OUTPUT_DIR/$NEW_RELEASE_DATE | ||
JSON_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons | ||
MMCIF_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-mmcifs | ||
CMSCAN_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans | ||
FASTA_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-fastas | ||
UPLOAD_PATH=$OUTPUT_DIR/$NEW_RELEASE_DATE/uploads | ||
|
||
# make directories | ||
mkdir -p $JSON_PATH | ||
mkdir -p $MMCIF_PATH | ||
mkdir -p $CMSCAN_PATH | ||
mkdir -p $FASTA_PATH | ||
mkdir -p $UPLOAD_PATH | ||
|
||
# prepare the env | ||
mamba activate rna3db | ||
|
||
# download latest mmCIF files | ||
bash $RNA3DB_ROOT_DIR/scripts/download_pdb_mmcif.sh $MMCIF_DIR | ||
echo "Downloading..." | ||
#bash $RNA3DB_ROOT_DIR/scripts/download_pdb_mmcif.sh $PDB_MMCIF_DIR | ||
echo "Finished downloading." | ||
|
||
# run parse | ||
echo "Parsing..." | ||
python -m rna3db parse \ | ||
$MMCIF_DIR \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/parse.json | ||
$PDB_MMCIF_DIR \ | ||
$JSON_PATH/parse.json | ||
echo "Finished parsing." | ||
|
||
# run filter | ||
python -m rna3db filter $MMCIF_DIR $OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/filter.json | ||
echo "Filtering..." | ||
python -m rna3db filter \ | ||
$JSON_PATH/parse.json \ | ||
$JSON_PATH/filter.json | ||
echo "Finished filtering." | ||
|
||
# write only the new sequences to a FASTA file | ||
echo "Building FASTA..." | ||
python $RNA3DB_ROOT_DIR/scripts/build_incremental_release_fasta.py \ | ||
$OUTPUT_DIR/$OLD_RELEASE_DATE/rna3db-jsons/parse.json \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/parse.json \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.fasta | ||
|
||
# do cmscan on all new sequences | ||
CMSCAN --cpu 64 \ | ||
-o $OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.o \ | ||
-tbl $OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.tbl \ | ||
$OLD_RELEASE/rna3db-jsons/parse.json \ | ||
$JSON_PATH/parse.json \ | ||
$FASTA_PATH/$NEW_RELEASE_DATE.fasta | ||
echo "Finished building FASTA." | ||
|
||
echo "Copying old cmscans..." | ||
cp $OLD_RELEASE/rna3db-cmscans/*.tbl $CMSCAN_PATH/ | ||
cp $OLD_RELEASE/rna3db-cmscans/*.o $CMSCAN_PATH/ | ||
echo "Copied old cmscans." | ||
|
||
# do cmscan on new sequences | ||
echo "Running cmscan..." | ||
$CMSCAN --cpu 64 \ | ||
-o $CMSCAN_PATH/$NEW_RELEASE_DATE.o \ | ||
--tbl $CMSCAN_PATH/$NEW_RELEASE_DATE.tbl \ | ||
$CMDB \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.fasta | ||
$FASTA_PATH/$NEW_RELEASE_DATE.fasta | ||
echo "Finished cmscan." | ||
|
||
# find new sequences that did not get a hit | ||
# find sequences that did not get a hit | ||
echo "Finding sequences without hits..." | ||
python $RNA3DB_ROOT_DIR/scripts/get_nohits.py \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.fasta \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE-nohits.fasta \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/ | ||
|
||
# re-scan new sequences with --max that did not get a hit | ||
CMSCAN --max --cpu 64 \ | ||
-o $OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE-nohits.o \ | ||
-tbl $OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE-nohits.tbl \ | ||
$FASTA_PATH/$NEW_RELEASE_DATE.fasta \ | ||
$FASTA_PATH/$NEW_RELEASE_DATE-nohits.fasta \ | ||
$CMSCAN_PATH | ||
echo "Finished finding sequences." | ||
|
||
# re-scan sequences with --max that did not get a hit | ||
echo "Running cmscan (no hits)..." | ||
$CMSCAN --max --cpu 64 \ | ||
-o $CMSCAN_PATH/$NEW_RELEASE_DATE-nohits.o \ | ||
--tbl $CMSCAN_PATH/$NEW_RELEASE_DATE-nohits.tbl \ | ||
$CMDB \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/$NEW_RELEASE_DATE.fasta | ||
$FASTA_PATH/$NEW_RELEASE_DATE-nohits.fasta | ||
echo "Finished cmscan (no hits)." | ||
|
||
# run cluster | ||
echo "Clustering..." | ||
python -m rna3db cluster \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/filter.json \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/cluster.json \ | ||
--tbl_dir $OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans | ||
$JSON_PATH/filter.json \ | ||
$JSON_PATH/cluster.json \ | ||
--tbl_dir $CMSCAN_PATH | ||
echo "Cleaning up..." | ||
rm -r $JSON_PATH/mmseqs2_* | ||
echo "Finished clustering." | ||
|
||
# run split | ||
echo "Splitting..." | ||
python -m rna3db split \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/cluster.json \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/split.json | ||
$JSON_PATH/cluster.json \ | ||
$JSON_PATH/split.json | ||
echo "Finished splitting." | ||
|
||
# make mmCIFs | ||
echo "Making mmCIFs..." | ||
python scripts/json_to_mmcif.py \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-jsons/split.json \ | ||
/Users/marcell/Documents/rna3db/data/pdb_mmcif/ \ | ||
$MMCIF_DIR \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-mmcifs | ||
$JSON_PATH/split.json \ | ||
$PDB_MMCIF_DIR \ | ||
$MMCIF_PATH | ||
echo "Finished writing mmCIFs." | ||
|
||
# compress files ready for release | ||
tar -czvf \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans.tar.gz \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans | ||
tar -czvf \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans.tar.gz \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-cmscans | ||
tar -cfvJ \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-mmcifs.tar.xz \ | ||
$OUTPUT_DIR/$NEW_RELEASE_DATE/rna3db-mmcifs | ||
echo "Compressing..." | ||
tar -czf \ | ||
$UPLOAD_PATH/rna3db-cmscans.tar.gz \ | ||
$CMSCAN_PATH | ||
tar -czf \ | ||
$UPLOAD_PATH/rna3db-jsons.tar.gz \ | ||
$JSON_PATH | ||
tar -cJf \ | ||
$UPLOAD_PATH/rna3db-mmcifs.tar.xz \ | ||
$MMCIF_PATH | ||
|
||
echo "Finished." | ||
echo $NEW_RELEASE_DATE "release ready for upload." | ||
|