From acf11472318b6b0c4d2e4201a6cce415e9a6cea4 Mon Sep 17 00:00:00 2001 From: ramari16 Date: Fri, 6 Sep 2024 14:59:10 -0400 Subject: [PATCH] ALS-7014: Implement signed URL functionality for data exports (#119) * ALS-7165: Update table name to include dataset id --- data/pom.xml | 11 +++ pom.xml | 16 +++- .../avillach/hpds/processing/AsyncResult.java | 4 + .../hpds/processing/ResultStoreStream.java | 4 + .../hpds/processing/io/PfbWriter.java | 27 ++++-- .../processing/upload/SignUrlService.java | 84 +++++++++++++++++++ .../hpds/processing/io/PfbWriterTest.java | 9 +- .../avillach/hpds/service/PicSureService.java | 44 ++++++---- .../avillach/hpds/service/QueryService.java | 5 +- .../resources/application-bdc-auth.properties | 6 +- 10 files changed, 179 insertions(+), 31 deletions(-) create mode 100644 processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/upload/SignUrlService.java diff --git a/data/pom.xml b/data/pom.xml index 20dee146..85c96dad 100644 --- a/data/pom.xml +++ b/data/pom.xml @@ -49,6 +49,17 @@ org.projectlombok lombok + + software.amazon.awssdk + s3 + + + + commons-logging + commons-logging + + + diff --git a/pom.xml b/pom.xml index 9754c284..6b0e331e 100644 --- a/pom.xml +++ b/pom.xml @@ -28,6 +28,7 @@ UTF-8 1.4.10 + 2.20.153 @@ -199,7 +200,7 @@ edu.harvard.hms.dbmi.avillach pic-sure-resource-api - 2.1.0-SNAPSHOT + 2.2.0-SNAPSHOT org.apache.logging.log4j @@ -320,7 +321,18 @@ snappy-java 1.1.10.5 - + + software.amazon.awssdk + s3 + ${aws.version} + + + + commons-logging + commons-logging + + + diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AsyncResult.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AsyncResult.java index 7312b6f9..70e9d38a 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AsyncResult.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/AsyncResult.java @@ -43,6 +43,10 @@ public MediaType 
getResponseType() { return responseType; } + public File getFile() { + return stream.getFile(); + } + public static enum Status{ SUCCESS { @Override diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/ResultStoreStream.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/ResultStoreStream.java index d0ffc2f7..bc826817 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/ResultStoreStream.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/ResultStoreStream.java @@ -117,4 +117,8 @@ public long estimatedSize() { public void closeWriter() { writer.close(); } + + public File getFile() { + return writer.getFile(); + } } diff --git a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/io/PfbWriter.java b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/io/PfbWriter.java index 9f09bbd8..b10fc08c 100644 --- a/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/io/PfbWriter.java +++ b/processing/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/processing/io/PfbWriter.java @@ -2,7 +2,6 @@ import org.apache.avro.Schema; import org.apache.avro.SchemaBuilder; -import org.apache.avro.file.Codec; import org.apache.avro.file.CodecFactory; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericData; @@ -21,10 +20,15 @@ public class PfbWriter implements ResultWriter { + public static final String PATIENT_TABLE_PREFIX = "pic-sure-"; private Logger log = LoggerFactory.getLogger(PfbWriter.class); private final Schema metadataSchema; private final Schema nodeSchema; + + private final String queryId; + + private final String patientTableName; private SchemaBuilder.FieldAssembler entityFieldAssembler; private List fields; @@ -32,11 +36,14 @@ public class PfbWriter implements ResultWriter { private File file; private Schema entitySchema; private Schema patientDataSchema; + private Schema 
relationSchema; private static final Set SINGULAR_FIELDS = Set.of("patient_id"); - public PfbWriter(File tempFile) { - file = tempFile; + public PfbWriter(File tempFile, String queryId) { + this.file = tempFile; + this.queryId = queryId; + this.patientTableName = formatFieldName(PATIENT_TABLE_PREFIX + queryId); entityFieldAssembler = SchemaBuilder.record("entity") .namespace("edu.harvard.dbmi") .fields(); @@ -53,12 +60,19 @@ public PfbWriter(File tempFile) { metadataRecord.requiredString("misc"); metadataRecord = metadataRecord.name("nodes").type(SchemaBuilder.array().items(nodeSchema)).noDefault(); metadataSchema = metadataRecord.endRecord(); + + + SchemaBuilder.FieldAssembler relationRecord = SchemaBuilder.record("Relation") + .fields() + .requiredString("dst_name") + .requiredString("dst_id"); + relationSchema = relationRecord.endRecord(); } @Override public void writeHeader(String[] data) { fields = Arrays.stream(data.clone()).map(this::formatFieldName).collect(Collectors.toList()); - SchemaBuilder.FieldAssembler patientRecords = SchemaBuilder.record("patientData") + SchemaBuilder.FieldAssembler patientRecords = SchemaBuilder.record(patientTableName) .fields(); fields.forEach(field -> { @@ -76,6 +90,7 @@ public void writeHeader(String[] data) { entityFieldAssembler = entityFieldAssembler.name("object").type(objectSchema).noDefault(); entityFieldAssembler.nullableString("id", "null"); entityFieldAssembler.requiredString("name"); + entityFieldAssembler = entityFieldAssembler.name("relations").type(SchemaBuilder.array().items(relationSchema)).noDefault(); entitySchema = entityFieldAssembler.endRecord(); DatumWriter datumWriter = new GenericDatumWriter(entitySchema); @@ -126,6 +141,7 @@ private void writeMetadata() { entityRecord.put("object", metadata); entityRecord.put("name", "metadata"); entityRecord.put("id", "null"); + entityRecord.put("relations", List.of()); try { dataFileWriter.append(entityRecord); @@ -163,8 +179,9 @@ public void 
package edu.harvard.hms.dbmi.avillach.hpds.processing.upload;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
import software.amazon.awssdk.services.s3.presigner.S3Presigner;
import software.amazon.awssdk.services.s3.presigner.model.GetObjectPresignRequest;
import software.amazon.awssdk.services.s3.presigner.model.PresignedGetObjectRequest;

import java.io.File;
import java.time.Duration;
import java.util.HashMap;
import java.util.Map;

/**
 * Uploads query-result files to the configured S3 bucket and mints
 * time-limited presigned GET URLs so clients can download data exports
 * directly from S3.
 *
 * <p>Credentials are resolved by the AWS SDK default provider chain;
 * bucket, region and URL lifetime come from the {@code data-export.s3.*}
 * properties.
 */
@Component
public class SignUrlService {

    private static final Logger log = LoggerFactory.getLogger(SignUrlService.class);

    private final String bucketName;
    private final int signedUrlExpiryMinutes;
    private final Region region;

    @Autowired
    public SignUrlService(
            @Value("${data-export.s3.bucket-name:}") String bucketName,
            @Value("${data-export.s3.region:us-east-1}") String region,
            @Value("${data-export.s3.signedUrl-expiry-minutes:60}") int signedUrlExpiryMinutes
    ) {
        this.bucketName = bucketName;
        this.signedUrlExpiryMinutes = signedUrlExpiryMinutes;
        this.region = Region.of(region);
    }

    /**
     * Uploads {@code file} to the configured bucket under {@code objectKey}.
     *
     * <p>The {@link S3Client} is managed with try-with-resources so it is
     * closed even when the upload throws (the previous version leaked the
     * client on any {@code putObject} failure).
     *
     * @param file      local file to upload
     * @param objectKey destination key within the bucket
     */
    public void uploadFile(File file, String objectKey) {
        try (S3Client s3 = S3Client.builder()
                .region(region)
                .build()) {
            putS3Object(s3, bucketName, objectKey, file);
        }
    }

    /**
     * Puts a single object into S3. Uses {@link RequestBody#fromFile(File)}
     * so the file is streamed rather than loaded into memory.
     *
     * @param s3         open client supplied by the caller (caller owns its lifecycle)
     * @param bucketName destination bucket
     * @param objectKey  destination key
     * @param file       local file whose bytes become the object body
     */
    public void putS3Object(S3Client s3, String bucketName, String objectKey, File file) {
        // Currently empty; kept so future object metadata can be attached here.
        Map<String, String> metadata = new HashMap<>();
        PutObjectRequest putOb = PutObjectRequest.builder()
                .bucket(bucketName)
                .key(objectKey)
                .metadata(metadata)
                .build();

        s3.putObject(putOb, RequestBody.fromFile(file));
        log.info("Successfully placed {} into bucket {}", objectKey, bucketName);
    }

    /**
     * Creates a presigned GET URL for {@code keyName} in the configured bucket.
     * The URL expires after {@code signedUrlExpiryMinutes}, driven by the
     * {@code data-export.s3.signedUrl-expiry-minutes} property (the old
     * "expires in 10 minutes" comment was stale sample text).
     *
     * @param keyName object key to sign
     * @return externally shareable URL valid for the configured duration
     */
    public String createPresignedGetUrl(String keyName) {
        PresignedGetObjectRequest presignedRequest;
        try (S3Presigner presigner = S3Presigner.builder().region(region).build()) {
            GetObjectRequest objectRequest = GetObjectRequest.builder()
                    .bucket(bucketName)
                    .key(keyName)
                    .build();

            GetObjectPresignRequest presignRequest = GetObjectPresignRequest.builder()
                    .signatureDuration(Duration.ofMinutes(signedUrlExpiryMinutes))
                    .getObjectRequest(objectRequest)
                    .build();

            presignedRequest = presigner.presignGetObject(presignRequest);
        }
        log.info("Presigned URL: [{}]", presignedRequest.url());

        return presignedRequest.url().toExternalForm();
    }
}
formattedName = pfbWriter.formatFieldName("123Topmed Study Accession with Subject ID\\\\"); assertEquals("_123Topmed_Study_Accession_with_Subject_ID__", formattedName); } @Test public void formatFieldName_randomGarbage_replaceWithUnderscore() { - PfbWriter pfbWriter = new PfbWriter(new File("target/test-result.avro")); + PfbWriter pfbWriter = new PfbWriter(new File("target/test-result.avro"), UUID.randomUUID().toString()); String formattedName = pfbWriter.formatFieldName("$$$my garbage @vro var!able nam#"); assertEquals("___my_garbage__vro_var_able_nam_", formattedName); } diff --git a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java index a32839ad..1d0f8c2e 100644 --- a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java +++ b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/PicSureService.java @@ -1,14 +1,14 @@ package edu.harvard.hms.dbmi.avillach.hpds.service; -import java.io.ByteArrayInputStream; +import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.Map.Entry; import java.util.stream.Collectors; import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.InfoColumnMeta; +import edu.harvard.hms.dbmi.avillach.hpds.processing.upload.SignUrlService; import edu.harvard.hms.dbmi.avillach.hpds.service.util.Paginator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,7 +28,6 @@ import edu.harvard.dbmi.avillach.domain.*; import edu.harvard.dbmi.avillach.util.UUIDv5; import edu.harvard.hms.dbmi.avillach.hpds.crypto.Crypto; -import edu.harvard.hms.dbmi.avillach.hpds.data.genotype.FileBackedByteIndexedInfoStore; import edu.harvard.hms.dbmi.avillach.hpds.data.phenotype.ColumnMeta; import edu.harvard.hms.dbmi.avillach.hpds.data.query.Query; import 
edu.harvard.hms.dbmi.avillach.hpds.processing.*; @@ -41,13 +40,15 @@ public class PicSureService { @Autowired public PicSureService(QueryService queryService, TimelineProcessor timelineProcessor, CountProcessor countProcessor, - VariantListProcessor variantListProcessor, AbstractProcessor abstractProcessor, Paginator paginator) { + VariantListProcessor variantListProcessor, AbstractProcessor abstractProcessor, Paginator paginator, + SignUrlService signUrlService) { this.queryService = queryService; this.timelineProcessor = timelineProcessor; this.countProcessor = countProcessor; this.variantListProcessor = variantListProcessor; this.abstractProcessor = abstractProcessor; this.paginator = paginator; + this.signUrlService = signUrlService; Crypto.loadDefaultKey(); } @@ -67,6 +68,8 @@ public PicSureService(QueryService queryService, TimelineProcessor timelineProce private final Paginator paginator; + private final SignUrlService signUrlService; + private static final String QUERY_METADATA_FIELD = "queryMetadata"; private static final int RESPONSE_CACHE_SIZE = 50; @@ -213,19 +216,7 @@ private QueryStatus convertToQueryStatus(AsyncResult entity) { public ResponseEntity queryResult(@PathVariable("resourceQueryId") UUID queryId, @RequestBody QueryRequest resultRequest) throws IOException { AsyncResult result = queryService.getResultFor(queryId.toString()); if (result == null) { - // This happens sometimes when users immediately request the status for a query - // before it can be initialized. We wait a bit and try again before throwing an - // error. 
- try { - Thread.sleep(100); - } catch (InterruptedException e) { - return ResponseEntity.status(500).build(); - } - - result = queryService.getResultFor(queryId.toString()); - if (result == null) { - return ResponseEntity.status(404).build(); - } + return ResponseEntity.status(404).build(); } if (result.getStatus() == AsyncResult.Status.SUCCESS) { result.open(); @@ -237,6 +228,25 @@ public ResponseEntity queryResult(@PathVariable("resourceQueryId") UUID queryId, } } + @PostMapping(value = "/query/{resourceQueryId}/signed-url") + public ResponseEntity querySignedURL(@PathVariable("resourceQueryId") UUID queryId, @RequestBody QueryRequest resultRequest) throws IOException { + AsyncResult result = queryService.getResultFor(queryId.toString()); + if (result == null) { + return ResponseEntity.status(404).build(); + } + if (result.getStatus() == AsyncResult.Status.SUCCESS) { + File file = result.getFile(); + signUrlService.uploadFile(file, file.getName()); + String presignedGetUrl = signUrlService.createPresignedGetUrl(file.getName()); + log.info("Presigned url: " + presignedGetUrl); + return ResponseEntity.ok() + .contentType(MediaType.APPLICATION_JSON) + .body(new SignedUrlResponse(presignedGetUrl)); + } else { + return ResponseEntity.status(400).body("Status : " + result.getStatus().name()); + } + } + @PostMapping("/query/{resourceQueryId}/status") public QueryStatus queryStatus(@PathVariable("resourceQueryId") UUID queryId, @RequestBody QueryRequest request) { return convertToQueryStatus(queryService.getStatusFor(queryId.toString())); diff --git a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/QueryService.java b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/QueryService.java index 5fe1a0a9..a41a94d2 100644 --- a/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/QueryService.java +++ b/service/src/main/java/edu/harvard/hms/dbmi/avillach/hpds/service/QueryService.java @@ -135,9 +135,10 @@ private AsyncResult 
initializeResult(Query query) throws ClassNotFoundException, throw new RuntimeException("UNSUPPORTED RESULT TYPE"); } + String queryId = UUIDv5.UUIDFromString(query.toString()).toString(); ResultWriter writer; if (ResultType.DATAFRAME_PFB.equals(query.getExpectedResultType())) { - writer = new PfbWriter(File.createTempFile("result-" + System.nanoTime(), ".avro")); + writer = new PfbWriter(File.createTempFile("result-" + System.nanoTime(), ".avro"), queryId); } else { writer = new CsvWriter(File.createTempFile("result-" + System.nanoTime(), ".sstmp")); } @@ -145,7 +146,7 @@ private AsyncResult initializeResult(Query query) throws ClassNotFoundException, AsyncResult result = new AsyncResult(query, p, writer) .setStatus(AsyncResult.Status.PENDING) .setQueuedTime(System.currentTimeMillis()) - .setId(UUIDv5.UUIDFromString(query.toString()).toString()); + .setId(queryId); query.setId(result.getId()); results.put(result.getId(), result); return result; diff --git a/service/src/main/resources/application-bdc-auth.properties b/service/src/main/resources/application-bdc-auth.properties index 860f5d64..272b84ec 100644 --- a/service/src/main/resources/application-bdc-auth.properties +++ b/service/src/main/resources/application-bdc-auth.properties @@ -3,4 +3,8 @@ SMALL_TASK_THREADS = 1 LARGE_TASK_THREADS = 1 hpds.genomicProcessor.impl=localPatientDistributed -HPDS_GENOMIC_DATA_DIRECTORY=/opt/local/hpds/all/ \ No newline at end of file +HPDS_GENOMIC_DATA_DIRECTORY=/opt/local/hpds/all/ + +data-export.s3.bucket-name=pic-sure-auth-dev-data-export +data-export.s3.region=us-east-1 +data-export.s3.signedUrl-expiry-minutes=30 \ No newline at end of file