diff --git a/.env.example b/.env.example index 007959309..0a41729d7 100755 --- a/.env.example +++ b/.env.example @@ -142,4 +142,17 @@ CROSSREF_NOTIFY_ENDPOINT=endpoint # Automated metadata AUTOMATED_METADATA_API=http://host.docker.internal:5005 -AUTOMATED_METADATA_API_KEY= \ No newline at end of file +AUTOMATED_METADATA_API_KEY= + +# Elastic Search, required for /v1/search endpoints +ELASTIC_SEARCH_NODE_URL= +ELASTIC_SEARCH_USER= +ELASTIC_SEARCH_PW= + +# Elastic search local dev node configuration +ES_NODE=http://host.docker.internal:9200 +ES_DB_HOST= +ES_DB_PORT= +ES_DB_NAME= +ES_DB_USER= +ES_DB_PASSWORD= diff --git a/.github/workflows/build-and-test.yaml b/.github/workflows/build-and-test.yaml index 956ee3061..ca44c8ba7 100644 --- a/.github/workflows/build-and-test.yaml +++ b/.github/workflows/build-and-test.yaml @@ -50,6 +50,7 @@ jobs: - name: Set up docker-compose run: | sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose sudo docker-compose --version docker info diff --git a/.github/workflows/build-server.yaml b/.github/workflows/build-server.yaml index ebef63f23..a064e4854 100644 --- a/.github/workflows/build-server.yaml +++ b/.github/workflows/build-server.yaml @@ -52,6 +52,7 @@ jobs: - name: Set up docker-compose run: | sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose sudo docker-compose --version docker info diff --git a/.github/workflows/models-build-and-test.yaml b/.github/workflows/models-build-and-test.yaml index 172b2156c..63d25cbb0 100644 --- a/.github/workflows/models-build-and-test.yaml +++ b/.github/workflows/models-build-and-test.yaml @@ -35,6 +35,7 @@ jobs: - name: Set up docker-compose run: | sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose sudo docker-compose --version docker info diff --git a/desci-elastic/Dockerfile-logstash b/desci-elastic/Dockerfile-logstash new file mode 100644 index 000000000..2b5dd33e0 --- /dev/null +++ b/desci-elastic/Dockerfile-logstash @@ -0,0 +1,15 @@ +FROM docker.elastic.co/logstash/logstash:8.14.3 + +USER root + +# Install curl +RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/* + +# Copy the initialization script +COPY desci-elastic/init-logstash.sh /usr/local/bin/init-logstash.sh +RUN chmod +x /usr/local/bin/init-logstash.sh + +USER logstash + +# Set the entrypoint to the initialization script +ENTRYPOINT ["/usr/local/bin/init-logstash.sh"] \ No newline at end of file diff --git a/desci-elastic/init-logstash.sh b/desci-elastic/init-logstash.sh new file mode 100644 index 000000000..d613fb2b3 --- /dev/null +++ b/desci-elastic/init-logstash.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +# Configuration +DRIVER_URL="https://jdbc.postgresql.org/download/postgresql-42.7.3.jar" +DRIVER_DIR="/opt/logstash/drivers" +DRIVER_FILE="$DRIVER_DIR/postgresql-42.7.3.jar" + +# Ensure the driver directory exists +mkdir -p "$DRIVER_DIR" + + +download_driver() { + echo "Downloading PostgreSQL JDBC driver..." + curl -# -o "$DRIVER_FILE" "$DRIVER_URL" + chmod 644 "$DRIVER_FILE" + echo "Driver downloaded and permissions set." +} + +# Check if driver exists and download if necessary +if [ -f "$DRIVER_FILE" ]; then + echo "PostgreSQL JDBC driver already exists." +else + download_driver +fi + +# Verify the driver file +if [ ! -f "$DRIVER_FILE" ]; then + echo "Error: Failed to download or locate the PostgreSQL JDBC driver." + exit 1 +fi + +# Ensure correct permissions on the driver file +chmod 644 "$DRIVER_FILE" + +# Start Logstash with the provided pipeline configuration +exec logstash -f /usr/share/logstash/pipeline/logstash.conf \ No newline at end of file diff --git a/desci-elastic/logstash-authors.conf b/desci-elastic/logstash-authors.conf new file mode 100644 index 000000000..5a9d9b3cc --- /dev/null +++ b/desci-elastic/logstash-authors.conf @@ -0,0 +1,62 @@ +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + statement => " + SELECT + id::text, + orcid::text, + display_name::text, + display_name_alternatives::text, + works_count::text, + cited_by_count::text, + last_known_institution::text, + works_api_url::text, + updated_date::text + FROM openalex.authors + WHERE updated_date > CAST(:sql_last_value AS TIMESTAMP) + ORDER BY updated_date ASC, id ASC + LIMIT 1000 + " + use_column_value => true + tracking_column => "updated_date" + tracking_column_type => "timestamp" + last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" + jdbc_paging_enabled => true + jdbc_page_size => 1000 + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } + json { + source => "display_name_alternatives" + target => "display_name_alternatives" + skip_on_invalid_json => true + } + json { + source => "last_known_institution" + target => "last_known_institution" + skip_on_invalid_json => true + } + mutate { + convert => { + "works_count" => "integer" + "cited_by_count" => "integer" + } + } +} + +output { + elasticsearch { + hosts => ["${ES_NODE}"] + index => "authors" + document_id => "%{id}" + doc_as_upsert => true + } +} \ No newline at end of file diff --git a/desci-elastic/logstash-works-authorships.conf b/desci-elastic/logstash-works-authorships.conf new file mode 100644 index 000000000..3279d3a0f --- /dev/null +++ b/desci-elastic/logstash-works-authorships.conf @@ -0,0 +1,37 @@ +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + statement => " + SELECT + work_id, + author_id, + author_position, + raw_affiliation_string, + institution_id + FROM openalex.works_authorships + ORDER BY work_id ASC + LIMIT 10000 + " + jdbc_paging_enabled => true + jdbc_page_size => 1000 + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } +} + +output { + elasticsearch { + hosts => ["${ES_NODE}"] + index => "works_authorships" + document_id => "%{work_id}-%{author_id}" + doc_as_upsert => true + } +} \ No newline at end of file diff --git a/desci-elastic/logstash-works.conf b/desci-elastic/logstash-works.conf new file mode 100644 index 000000000..12c51fe9b --- /dev/null +++ b/desci-elastic/logstash-works.conf @@ -0,0 +1,64 @@ +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + statement => " + SELECT + id::TEXT, + orcid::TEXT, + display_name::TEXT, + display_name_alternatives::TEXT, + works_count::TEXT, + cited_by_count::TEXT, + last_known_institution::TEXT, + works_api_url::TEXT, + updated_date::TEXT + FROM openalex.authors + WHERE updated_date > CAST(:sql_last_value AS TIMESTAMP) + ORDER BY updated_date ASC, id ASC + LIMIT 1000 + " + use_column_value => true + tracking_column => "updated_date" + tracking_column_type => "timestamp" + last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" + jdbc_paging_enabled => true + jdbc_page_size => 1000 + codec => json + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } + json { + source => "display_name_alternatives" + target => "display_name_alternatives" + skip_on_invalid_json => true + } + json { + source => "last_known_institution" + target => "last_known_institution" + skip_on_invalid_json => true + } + mutate { + convert => { + "works_count" => "integer" + "cited_by_count" => "integer" + } + } +} + +output { + stdout { codec => json } + elasticsearch { + hosts => ["${ES_NODE}"] + index => "authors" + document_id => "%{id}" + doc_as_upsert => true + } +} \ No newline at end of file diff --git a/desci-elastic/logstash.conf b/desci-elastic/logstash.conf new file mode 100644 index 000000000..aa484f804 --- /dev/null +++ b/desci-elastic/logstash.conf @@ -0,0 +1,98 @@ +# Imports a denormalized works table including the authors table and the works_authorships join table +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + jdbc_paging_enabled => true + jdbc_page_size => 100 + use_column_value => true + tracking_column => "publication_date" + tracking_column_type => "timestamp" + last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" + statement => " + SELECT + w.id::TEXT AS work_id, + w.doi::TEXT, + w.title::TEXT, + w.publication_year::TEXT, + w.type::TEXT, + w.cited_by_count::TEXT AS cited_by_count, + w.abstract_inverted_index::TEXT as abstract_inverted_index, + w.publication_date::TIMESTAMP AS publication_date + FROM openalex.works w + WHERE w.publication_date::TIMESTAMP > CAST(:sql_last_value AS TIMESTAMP) + ORDER BY w.publication_date::TIMESTAMP ASC, w.id ASC + LIMIT 100 + " + codec => json + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } + json { + source => "abstract_inverted_index" + target => "abstract_inverted_index_parsed" + } + ruby { + code => ' + abstract_inverted_index = event.get("abstract_inverted_index_parsed") + if abstract_inverted_index + abstract_length = abstract_inverted_index.values.flatten.max + 1 + abstract_words = Array.new(abstract_length, "") + abstract_inverted_index.each do |word, positions| + positions.each do |position| + abstract_words[position] = word + end + end + abstract = abstract_words.join(" ") + event.set("abstract", abstract) + end + ' + } + mutate { + remove_field => ["abstract_inverted_index", "abstract_inverted_index_parsed"] + convert => { + "cited_by_count" => "integer" + "publication_year" => "integer" + } + } + jdbc_streaming { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + statement => " + SELECT + a.id AS author_id, + wa.author_position, + a.display_name AS author_name, + a.works_count AS author_works_count, + a.cited_by_count AS author_cited_by_count, + wa.institution_id, + a.orcid + FROM openalex.works_authorships wa + JOIN openalex.authors a ON wa.author_id = a.id + WHERE wa.work_id = :work_id + ORDER BY wa.author_position ASC + " + parameters => { "work_id" => "work_id" } + target => "authors" + } +} + +output { + stdout { codec => json } + elasticsearch { + hosts => ["${ES_NODE}"] + index => "denormalized_works_test2" + document_id => "%{[work_id]}" + doc_as_upsert => true + } +} \ No newline at end of file diff --git a/desci-server/kubernetes/deployment_dev.yaml b/desci-server/kubernetes/deployment_dev.yaml index 3c083523b..ab583228d 100644 --- a/desci-server/kubernetes/deployment_dev.yaml +++ b/desci-server/kubernetes/deployment_dev.yaml @@ -90,6 +90,9 @@ spec: export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }} export AUTOMATED_METADATA_API="{{ .Data.AUTOMATED_METADATA_API }}" export AUTOMATED_METADATA_API_KEY="{{ .Data.AUTOMATED_METADATA_API_KEY }}" + export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}" + export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}" + export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}" export DEBUG_TEST=0; echo "appfinish"; {{- end -}} diff --git a/desci-server/kubernetes/deployment_prod.yaml b/desci-server/kubernetes/deployment_prod.yaml index 794d45908..b7f2db79e 100755 --- a/desci-server/kubernetes/deployment_prod.yaml +++ b/desci-server/kubernetes/deployment_prod.yaml @@ -90,6 +90,9 @@ spec: export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }} export AUTOMATED_METADATA_API="{{ .Data.AUTOMATED_METADATA_API }}" export AUTOMATED_METADATA_API_KEY="{{ .Data.AUTOMATED_METADATA_API_KEY }}" + export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}" + export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}" + export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}" export IGNORE_LINE=0; export DEBUG_TEST=0; echo "appfinish"; diff --git a/desci-server/kubernetes/deployment_staging.yaml b/desci-server/kubernetes/deployment_staging.yaml index bd3591b8f..81f187e2d 100644 --- a/desci-server/kubernetes/deployment_staging.yaml +++ b/desci-server/kubernetes/deployment_staging.yaml @@ -102,6 +102,9 @@ spec: export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }} export AUTOMATED_METADATA_API={{ .Data.AUTOMATED_METADATA_API }} export AUTOMATED_METADATA_API_KEY={{ .Data.AUTOMATED_METADATA_API_KEY }} + export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}" + export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}" + export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}" export DEBUG_TEST=0; echo "appfinish"; {{- end -}} diff --git a/desci-server/package.json b/desci-server/package.json index 34374a8e1..e38dc8a66 100755 --- a/desci-server/package.json +++ b/desci-server/package.json @@ -59,6 +59,7 @@ "@desci-labs/desci-codex-lib": "^1.1.7", "@desci-labs/desci-contracts": "^0.2.6", "@desci-labs/desci-models": "0.2.9", + "@elastic/elasticsearch": "^8.14.0", "@honeycombio/opentelemetry-node": "^0.3.2", "@ipld/dag-pb": "^4.0.0", "@opentelemetry/api": "^1.8.0", diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts new file mode 100644 index 000000000..401a82dbf --- /dev/null +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -0,0 +1,99 @@ +import { Request, Response } from 'express'; + +import { elasticClient } from '../../elasticSearchClient.js'; +import { logger as parentLogger } from '../../logger.js'; +import { + buildBoolQuery, + buildMultiMatchQuery, + buildSortQuery, + DENORMALIZED_WORKS_INDEX, + VALID_ENTITIES, +} from '../../services/ElasticSearchService.js'; + +import { QueryDebuggingResponse, SingleQueryErrorResponse, SingleQuerySuccessResponse } from './query.js'; + +type Entity = string; +type Query = string; + +type QueryObject = Record; + +interface MultiQuerySearchParams { + queries: QueryObject[]; + fuzzy?: number; + sortType?: string; + sortOrder?: 'asc' | 'desc'; + page?: number; + perPage?: number; +} + +export const multiQuery = async ( + req: Request, + res: Response<(SingleQuerySuccessResponse & QueryDebuggingResponse) | SingleQueryErrorResponse>, +) => { + const { + queries, + fuzzy, + sortType = 'relevance', + sortOrder, + page = 1, + perPage = 10, + }: MultiQuerySearchParams = req.body; + const logger = parentLogger.child({ + module: 'SEARCH::MultiQuery', + queries, + fuzzy, + sortType, + sortOrder, + page, + perPage, + }); + + logger.trace({ fn: 'Executing elastic search query' }); + + const validEntityQueries = queries.filter((q) => VALID_ENTITIES.includes(Object.keys(q)[0])); + + if (!validEntityQueries) { + return res.status(400).json({ + ok: false, + error: `Invalid queries, the following entities are supported: ${VALID_ENTITIES.join(' ')}`, + }); + } + + const esQueries = validEntityQueries.map((q) => { + const [entity, query] = Object.entries(q)[0]; + return buildMultiMatchQuery(query, entity); + }); + const primaryEntity = Object.keys(validEntityQueries[0])[0]; + const esSort = buildSortQuery(DENORMALIZED_WORKS_INDEX, sortType, sortOrder); + const esBoolQuery = buildBoolQuery(esQueries); + + try { + logger.debug({ esQueries, esSort }, 'Executing query'); + const { hits } = await elasticClient.search({ + index: DENORMALIZED_WORKS_INDEX, + body: { + ...esBoolQuery, + sort: esSort, + from: (page - 1) * perPage, + size: perPage, + }, + }); + + logger.info({ fn: 'Elastic search multi query executed successfully' }); + + return res.json({ + // esQueries, + ok: true, + total: hits.total, + page, + perPage, + data: hits.hits, + }); + } catch (error) { + logger.error({ error }, 'Elastic search multi query failed'); + return res.status(500).json({ + ok: false, + error: 'An error occurred while searching', + }); + } +}; diff --git a/desci-server/src/controllers/search/query.ts b/desci-server/src/controllers/search/query.ts new file mode 100644 index 000000000..a9cce97a4 --- /dev/null +++ b/desci-server/src/controllers/search/query.ts @@ -0,0 +1,112 @@ +import { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types.js'; +import { Request, Response } from 'express'; + +import { elasticClient } from '../../elasticSearchClient.js'; +import { logger as parentLogger } from '../../logger.js'; +import { + buildBoolQuery, + buildMultiMatchQuery, + buildSimpleStringQuery, + buildSortQuery, + DENORMALIZED_WORKS_INDEX, + VALID_ENTITIES, +} from '../../services/ElasticSearchService.js'; + +export interface SingleQuerySuccessResponse extends QueryDebuggingResponse { + ok: true; + page: number; + perPage: number; + total: number | SearchTotalHits; + data: any[]; +} + +export interface QueryDebuggingResponse { + esQuery?: any; + esQueries?: any; + esSort?: any; +} + +export interface SingleQueryErrorResponse extends QueryDebuggingResponse { + ok: false; + error: string; +} + +interface QuerySearchBodyParams { + query: string; + entity: string; + fuzzy?: number; + sortType?: string; + sortOrder?: 'asc' | 'desc'; + page?: number; + perPage?: number; +} + +export const singleQuery = async ( + req: Request, + res: Response, +) => { + const { query, fuzzy, sortType = 'relevance', sortOrder, page = 1, perPage = 10 }: QuerySearchBodyParams = req.body; + + let { entity } = req.body; + + const logger = parentLogger.child({ + module: 'SEARCH::Query', + query, + entity, + fuzzy, + sortType, + sortOrder, + page, + perPage, + }); + if (entity === 'works') { + logger.info({ entity }, `Entity is 'works', changing to denormalized works index: ${DENORMALIZED_WORKS_INDEX}`); + entity = DENORMALIZED_WORKS_INDEX; + } + // + logger.trace({ fn: 'Executing elastic search query' }); + + if (!VALID_ENTITIES.includes(entity)) { + return res.status(400).json({ + ok: false, + error: `Invalid entity: ${entity}, the following entities are supported: ${VALID_ENTITIES.join(' ')}`, + }); + } + + // const esQuery = buildSimpleStringQuery(query, entity, fuzzy); + const esQuery = buildMultiMatchQuery(query, 'works_single', fuzzy); + const esSort = buildSortQuery(entity, sortType, sortOrder); + + try { + logger.debug({ esQuery, esSort }, 'Executing query'); + const results = await elasticClient.search({ + index: entity, + body: { + query: esQuery, + sort: esSort, + from: (page - 1) * perPage, + size: perPage, + }, + }); + const hits = results.hits; + logger.info({ fn: 'Elastic search query executed successfully' }); + + return res.json({ + esQuery, + esSort, + ok: true, + total: hits.total, + page, + perPage, + data: hits.hits, + }); + } catch (error) { + logger.error({ error }, 'Elastic search query failed'); + return res.status(500).json({ + ok: false, + error: 'An error occurred while searching', + esQuery, + esSort, + }); + } +}; diff --git a/desci-server/src/elasticSearchClient.ts b/desci-server/src/elasticSearchClient.ts new file mode 100644 index 000000000..aa2a6aafa --- /dev/null +++ b/desci-server/src/elasticSearchClient.ts @@ -0,0 +1,31 @@ +import { Client } from '@elastic/elasticsearch'; + +const esNodeUrl = process.env.ELASTIC_SEARCH_NODE_URL; +const esUser = process.env.ELASTIC_SEARCH_USER; +const esPw = process.env.ELASTIC_SEARCH_PW; + +if (!esNodeUrl || !esUser || !esPw) { + console.error('Missing environment variables for ElasticSearch'); +} + +const esAuthConfig = + !esNodeUrl?.includes('host.docker.internal') && esUser && esPw + ? { + // Auth unnecessary if running local ES node + auth: { + username: esUser, + password: esPw, + }, + } + : {}; + +export const elasticClient = + esNodeUrl && esUser && esPw + ? new Client({ + node: esNodeUrl, + ...esAuthConfig, + tls: { + rejectUnauthorized: false, // Temporary + }, + }) + : ({} as any); diff --git a/desci-server/src/routes/v1/index.ts b/desci-server/src/routes/v1/index.ts index 6fd4a4b07..a87e0ebae 100755 --- a/desci-server/src/routes/v1/index.ts +++ b/desci-server/src/routes/v1/index.ts @@ -21,6 +21,7 @@ import log from './log.js'; import nodes from './nodes.js'; import pub from './pub.js'; import referral from './referral.js'; +import search from './search.js'; import services from './services.js'; import users from './users.js'; import waitlist from './waitlist.js'; @@ -58,6 +59,7 @@ router.use('/services', services); router.use('/communities', communities); router.use('/attestations', attestations); router.use('/doi', doi); +router.use('/search', search); router.get('/nft/:id', nft); router.use('/referral', referral); diff --git a/desci-server/src/routes/v1/search.ts b/desci-server/src/routes/v1/search.ts new file mode 100644 index 000000000..4e88713cc --- /dev/null +++ b/desci-server/src/routes/v1/search.ts @@ -0,0 +1,12 @@ +import { Router } from 'express'; + +import { multiQuery } from '../../controllers/search/multiQuery.js'; +import { singleQuery } from '../../controllers/search/query.js'; +import { ensureUser } from '../../internal.js'; + +const router = Router(); + +router.post('/multi', [ensureUser], multiQuery); +router.post('/', [ensureUser], singleQuery); + +export default router; diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts new file mode 100644 index 000000000..7fc611f64 --- /dev/null +++ b/desci-server/src/services/ElasticSearchService.ts @@ -0,0 +1,173 @@ +import { + QueryDslFunctionBoostMode, + QueryDslQueryContainer, + QueryDslTextQueryType, +} from '@elastic/elasticsearch/lib/api/types.js'; + +export const DENORMALIZED_WORKS_INDEX = 'denormalized_works_test_2024_08_01'; +export const VALID_ENTITIES = [ + 'authors', + 'concepts', + 'institutions', + 'publishers', + 'sources', + 'topics', + 'works', + DENORMALIZED_WORKS_INDEX, +]; + +/** + * Ordered from most relevant to least relevant + */ +export const RELEVANT_FIELDS = { + works: ['title', 'abstract', 'doi'], + authors: ['display_name', 'orcid', 'last_known_institution'], + denorm_authors: ['authors.author_name', 'authors.orcid', 'authors.last_known_institution'], + works_single: [ + 'title^1.25', + 'abstract', + 'doi', + 'authors.author_name', + 'authors.orcid', + 'authors.last_known_institution', + ], +}; + +type SortOrder = 'asc' | 'desc'; +type SortField = { [field: string]: { order: SortOrder; missing?: string } }; + +const baseSort: SortField[] = [{ _score: { order: 'desc' } }]; + +const sortConfigs: { [entity: string]: { [sortType: string]: (order: SortOrder) => SortField[] } } = { + works: { + publication_year: (order) => [{ publication_year: { order, missing: '_last' } }], + publication_date: (order) => [{ publication_date: { order, missing: '_last' } }], + cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], + title: (order) => [{ 'title.keyword': { order, missing: '_last' } }], + relevance: () => [], + }, + authors: { + display_name: (order) => [{ 'display_name.keyword': { order, missing: '_last' } }], + works_count: (order) => [{ works_count: { order, missing: '_last' } }], + cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], + updated_date: (order) => [{ updated_date: { order, missing: '_last' } }], + relevance: () => [], + }, + [DENORMALIZED_WORKS_INDEX]: { + publication_year: (order) => [{ publication_year: { order, missing: '_last' } }], + publication_date: (order) => [{ publication_date: { order, missing: '_last' } }], + cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], + title: (order) => [{ 'title.keyword': { order, missing: '_last' } }], + author_name: (order) => [{ 'authors.author_name.keyword': { order, missing: '_last' } }], + relevance: () => [], + }, +}; + +export function scoreBoostFunction(query: Record<'multi_match', MultiMatchQuery>) { + return { + function_score: { + query, + functions: [ + { + field_value_factor: { + field: 'cited_by_count', + factor: 1.5, + modifier: 'log1p', + missing: 0, + }, + }, + { + field_value_factor: { + field: 'authors.author_cited_by_count', + factor: 0.1, + modifier: 'log1p', + missing: 0, + }, + }, + ], + boost_mode: 'sum' as QueryDslFunctionBoostMode, + score_mode: 'sum' as QueryDslFunctionBoostMode, + }, + }; +} + +export function buildSimpleStringQuery(query: string, entity: string, fuzzy?: number) { + return { + simple_query_string: { + query: query, + // [entity]: { + // query: query, + // }, + }, + }; +} + +export function buildBoolQuery(queries: any[]) { + return { + query: { + bool: { + // must: [], + should: queries, + // filter: [], + }, + }, + }; +} + +export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: number) { + let fields = []; + if (entity === 'works') fields = RELEVANT_FIELDS.works; + if (entity === 'authors') fields = RELEVANT_FIELDS.denorm_authors; + if (entity === 'works_single') fields = RELEVANT_FIELDS.works_single; + + const type: QueryDslTextQueryType = 'best_fields'; + const multiMatchQuery = { + multi_match: { + query: query, + fields: fields, + type, + fuzziness: fuzzy || 'AUTO', + }, + }; + + if (entity === 'works_single') return scoreBoostFunction(multiMatchQuery) as QueryDslQueryContainer; + return multiMatchQuery as QueryDslQueryContainer; +} + +export function buildSortQuery(entity: string, sortType?: string, sortOrder: SortOrder = 'desc'): SortField[] { + const entityConfig = sortConfigs[entity]; + if (!entityConfig) { + return baseSort; + } + + const sortFunction = entityConfig[sortType] || entityConfig['relevance'] || (() => []); + const specificSort = sortFunction(sortOrder); + + // return [...baseSort]; + return [...specificSort, ...baseSort]; +} + +export type IndexedAuthor = { + _index: string; + _id: string; + _score: number; + _source: { + works_count: number; + display_name: string; + cited_by_count: number; + works_api_url: string; + orcid: string | null; + id: string; + last_known_institution: any | null; + '@timestamp': string; + '@version': string; + updated_date: string; + }; +}; + +export interface MultiMatchQuery { + query: string; + fields: any[]; + type: QueryDslTextQueryType; + fuzziness: string | number; +} diff --git a/desci-server/yarn.lock b/desci-server/yarn.lock index 7f9f2b0fc..1a4d87bcd 100644 --- a/desci-server/yarn.lock +++ b/desci-server/yarn.lock @@ -2366,6 +2366,27 @@ ky-universal "^0.11.0" undici "^5.21.2" +"@elastic/elasticsearch@^8.14.0": + version "8.14.0" + resolved "https://registry.yarnpkg.com/@elastic/elasticsearch/-/elasticsearch-8.14.0.tgz#93b1f2a7cb6cc5cd1ceebf5060576bc690432e0a" + integrity sha512-MGrgCI4y+Ozssf5Q2IkVJlqt5bUMnKIICG2qxeOfrJNrVugMCBCAQypyesmSSocAtNm8IX3LxfJ3jQlFHmKe2w== + dependencies: + "@elastic/transport" "^8.6.0" + tslib "^2.4.0" + +"@elastic/transport@^8.6.0": + version "8.7.0" + resolved "https://registry.yarnpkg.com/@elastic/transport/-/transport-8.7.0.tgz#006987fc5583f61c266e0b1003371e82efc7a6b5" + integrity sha512-IqXT7a8DZPJtqP2qmX1I2QKmxYyN27kvSW4g6pInESE1SuGwZDp2FxHJ6W2kwmYOJwQdAt+2aWwzXO5jHo9l4A== + dependencies: + "@opentelemetry/api" "1.x" + debug "^4.3.4" + hpagent "^1.0.0" + ms "^2.1.3" + secure-json-parse "^2.4.0" + tslib "^2.4.0" + undici "^6.12.0" + "@emotion/is-prop-valid@^0.8.2": version "0.8.8" resolved "https://registry.yarnpkg.com/@emotion/is-prop-valid/-/is-prop-valid-0.8.8.tgz#db28b1c4368a259b60a97311d6a952d4fd01ac1a" @@ -3584,6 +3605,11 @@ dependencies: "@opentelemetry/api" "^1.0.0" +"@opentelemetry/api@1.x": + version "1.9.0" + resolved "https://registry.yarnpkg.com/@opentelemetry/api/-/api-1.9.0.tgz#d03eba68273dc0f7509e2a3d5cba21eae10379fe" + integrity sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg== + "@opentelemetry/api@^1.0.0", "@opentelemetry/api@^1.3.0", "@opentelemetry/api@^1.8.0": version "1.8.0" resolved "https://registry.yarnpkg.com/@opentelemetry/api/-/api-1.8.0.tgz#5aa7abb48f23f693068ed2999ae627d2f7d902ec" @@ -11314,6 +11340,11 @@ hosted-git-info@^4.0.1: dependencies: lru-cache "^6.0.0" +hpagent@^1.0.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/hpagent/-/hpagent-1.2.0.tgz#0ae417895430eb3770c03443456b8d90ca464903" + integrity sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA== + html-comment-regex@^1.1.2: version "1.1.2" resolved "https://registry.yarnpkg.com/html-comment-regex/-/html-comment-regex-1.1.2.tgz#97d4688aeb5c81886a364faa0cad1dda14d433a7" @@ -13276,7 +13307,7 @@ ms@2.1.2: resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w== -ms@2.1.3, ms@^2.0.0, ms@^2.1.1: +ms@2.1.3, ms@^2.0.0, ms@^2.1.1, ms@^2.1.3: version "2.1.3" resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.3.tgz#574c8138ce1d2b5861f0b44579dbadd60c6615b2" integrity sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA== @@ -15716,7 +15747,16 @@ string-template@~0.2.1: resolved "https://registry.yarnpkg.com/string-template/-/string-template-0.2.1.tgz#42932e598a352d01fc22ec3367d9d84eec6c9add" integrity sha512-Yptehjogou2xm4UJbxJ4CxgZx12HBfeystp0y3x7s4Dj32ltVVG1Gg8YhKjHZkHicuKpZX/ffilA8505VbUbpw== -"string-width-cjs@npm:string-width@^4.2.0", "string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3: +"string-width-cjs@npm:string-width@^4.2.0": + version "4.2.3" + resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" + integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== + dependencies: + emoji-regex "^8.0.0" + is-fullwidth-code-point "^3.0.0" + strip-ansi "^6.0.1" + +"string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3: version "4.2.3" resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== @@ -15789,7 +15829,7 @@ stringify-object@3.3.0: is-obj "^1.0.1" is-regexp "^1.0.0" -"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@^6.0.0, strip-ansi@^6.0.1: +"strip-ansi-cjs@npm:strip-ansi@^6.0.1": version "6.0.1" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== @@ -15803,6 +15843,13 @@ strip-ansi@^3.0.0: dependencies: ansi-regex "^2.0.0" +strip-ansi@^6.0.0, strip-ansi@^6.0.1: + version "6.0.1" + resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" + integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== + dependencies: + ansi-regex "^5.0.1" + strip-ansi@^7.0.1: version "7.1.0" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.1.0.tgz#d5b6568ca689d8561370b0707685d22434faff45" @@ -16514,6 +16561,11 @@ undici@^5.21.2: dependencies: "@fastify/busboy" "^2.0.0" +undici@^6.12.0: + version "6.19.2" + resolved "https://registry.yarnpkg.com/undici/-/undici-6.19.2.tgz#231bc5de78d0dafb6260cf454b294576c2f3cd31" + integrity sha512-JfjKqIauur3Q6biAtHJ564e3bWa8VvT+7cSiOJHFbX4Erv6CLGDpg8z+Fmg/1OI/47RA+GI2QZaF48SSaLvyBA== + unique-filename@^1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/unique-filename/-/unique-filename-1.1.1.tgz#1d69769369ada0583103a1e6ae87681b56573230" @@ -16833,7 +16885,7 @@ workerpool@6.2.1: resolved "https://registry.yarnpkg.com/workerpool/-/workerpool-6.2.1.tgz#46fc150c17d826b86a008e5a4508656777e9c343" integrity sha512-ILEIE97kDZvF9Wb9f6h5aXK4swSlKGUcOEGiIYb2OOu/IrDU9iwj0fD//SsA6E5ibwJxpEvhullJY4Sl4GcpAw== -"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0: +"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0": version "7.0.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== @@ -16851,6 +16903,15 @@ wrap-ansi@^6.2.0: string-width "^4.1.0" strip-ansi "^6.0.0" +wrap-ansi@^7.0.0: + version "7.0.0" + resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" + integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== + dependencies: + ansi-styles "^4.0.0" + string-width "^4.1.0" + strip-ansi "^6.0.0" + wrap-ansi@^8.1.0: version "8.1.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214" diff --git a/docker-compose-es.yml b/docker-compose-es.yml new file mode 100644 index 000000000..b5be8b96b --- /dev/null +++ b/docker-compose-es.yml @@ -0,0 +1,40 @@ +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.14.3 + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - "ES_JAVA_OPTS=-Xms3g -Xmx8g" + ports: + - "9200:9200" + volumes: + - ./local-data/esdata:/usr/share/elasticsearch/data + extra_hosts: + - "host.docker.internal:host-gateway" + + # logstash: + # build: + # context: . + # dockerfile: ./desci-elastic/Dockerfile-logstash + # volumes: + # - ./desci-elastic/logstash.conf:/usr/share/logstash/pipeline/logstash.conf + # - ./local-data/logstash/drivers:/opt/logstash/drivers + # environment: + # - "LS_JAVA_OPTS=-Xms3g -Xmx4g" + # env_file: + # - .env + # extra_hosts: + # - "host.docker.internal:host-gateway" + # depends_on: + # - elasticsearch + + kibana: + image: docker.elastic.co/kibana/kibana:8.14.3 + environment: + - ELASTICSEARCH_HOSTS=http://elasticsearch:9200 + ports: + - "5601:5601" + extra_hosts: + - "host.docker.internal:host-gateway" + depends_on: + - elasticsearch