From a6ffb6f70220a29b6ecbab335c98f172adbcbef4 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 17 Jul 2024 17:38:51 +0000 Subject: [PATCH 01/25] configure elastic search client --- .env.example | 7 ++- desci-server/package.json | 1 + desci-server/src/elasticSearchClient.ts | 21 ++++++++ desci-server/yarn.lock | 69 +++++++++++++++++++++++-- 4 files changed, 93 insertions(+), 5 deletions(-) create mode 100644 desci-server/src/elasticSearchClient.ts diff --git a/.env.example b/.env.example index 007959309..b8dffd14a 100755 --- a/.env.example +++ b/.env.example @@ -142,4 +142,9 @@ CROSSREF_NOTIFY_ENDPOINT=endpoint # Automated metadata AUTOMATED_METADATA_API=http://host.docker.internal:5005 -AUTOMATED_METADATA_API_KEY= \ No newline at end of file +AUTOMATED_METADATA_API_KEY= + +# Elastic Search, required for /v1/search endpoints +ELASTIC_SEARCH_NODE_URL= +ELASTIC_SEARCH_USER= +ELASTIC_SEARCH_PW= \ No newline at end of file diff --git a/desci-server/package.json b/desci-server/package.json index 34374a8e1..e38dc8a66 100755 --- a/desci-server/package.json +++ b/desci-server/package.json @@ -59,6 +59,7 @@ "@desci-labs/desci-codex-lib": "^1.1.7", "@desci-labs/desci-contracts": "^0.2.6", "@desci-labs/desci-models": "0.2.9", + "@elastic/elasticsearch": "^8.14.0", "@honeycombio/opentelemetry-node": "^0.3.2", "@ipld/dag-pb": "^4.0.0", "@opentelemetry/api": "^1.8.0", diff --git a/desci-server/src/elasticSearchClient.ts b/desci-server/src/elasticSearchClient.ts new file mode 100644 index 000000000..01f4bc742 --- /dev/null +++ b/desci-server/src/elasticSearchClient.ts @@ -0,0 +1,21 @@ +import { Client } from '@elastic/elasticsearch'; + +const esNodeUrl = process.env.ELASTIC_SEARCH_NODE_URL; +const esUser = process.env.ELASTIC_SEARCH_USER; +const esPw = process.env.ELASTIC_SEARCH_PW; + +if (!esNodeUrl || !esUser || !esPw) { + console.error('Missing environment variables for ElasticSearch'); +} + +export const elasticClient = new Client({ + node: esNodeUrl, + + auth: { + username: esUser, + password: esPw, + }, + tls: { + rejectUnauthorized: false, // Temporary + }, +}); diff --git a/desci-server/yarn.lock b/desci-server/yarn.lock index 7f9f2b0fc..1a4d87bcd 100644 --- a/desci-server/yarn.lock +++ b/desci-server/yarn.lock @@ -2366,6 +2366,27 @@ ky-universal "^0.11.0" undici "^5.21.2" +"@elastic/elasticsearch@^8.14.0": + version "8.14.0" + resolved "https://registry.yarnpkg.com/@elastic/elasticsearch/-/elasticsearch-8.14.0.tgz#93b1f2a7cb6cc5cd1ceebf5060576bc690432e0a" + integrity sha512-MGrgCI4y+Ozssf5Q2IkVJlqt5bUMnKIICG2qxeOfrJNrVugMCBCAQypyesmSSocAtNm8IX3LxfJ3jQlFHmKe2w== + dependencies: + "@elastic/transport" "^8.6.0" + tslib "^2.4.0" + +"@elastic/transport@^8.6.0": + version "8.7.0" + resolved "https://registry.yarnpkg.com/@elastic/transport/-/transport-8.7.0.tgz#006987fc5583f61c266e0b1003371e82efc7a6b5" + integrity sha512-IqXT7a8DZPJtqP2qmX1I2QKmxYyN27kvSW4g6pInESE1SuGwZDp2FxHJ6W2kwmYOJwQdAt+2aWwzXO5jHo9l4A== + dependencies: + "@opentelemetry/api" "1.x" + debug "^4.3.4" + hpagent "^1.0.0" + ms "^2.1.3" + secure-json-parse "^2.4.0" + tslib "^2.4.0" + undici "^6.12.0" + "@emotion/is-prop-valid@^0.8.2": version "0.8.8" resolved "https://registry.yarnpkg.com/@emotion/is-prop-valid/-/is-prop-valid-0.8.8.tgz#db28b1c4368a259b60a97311d6a952d4fd01ac1a" @@ -3584,6 +3605,11 @@ dependencies: "@opentelemetry/api" "^1.0.0" +"@opentelemetry/api@1.x": + version "1.9.0" + resolved "https://registry.yarnpkg.com/@opentelemetry/api/-/api-1.9.0.tgz#d03eba68273dc0f7509e2a3d5cba21eae10379fe" + integrity sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg== + "@opentelemetry/api@^1.0.0", "@opentelemetry/api@^1.3.0", "@opentelemetry/api@^1.8.0": version "1.8.0" resolved "https://registry.yarnpkg.com/@opentelemetry/api/-/api-1.8.0.tgz#5aa7abb48f23f693068ed2999ae627d2f7d902ec" @@ -11314,6 +11340,11 @@ hosted-git-info@^4.0.1: dependencies: lru-cache "^6.0.0" +hpagent@^1.0.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/hpagent/-/hpagent-1.2.0.tgz#0ae417895430eb3770c03443456b8d90ca464903" + integrity sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA== + html-comment-regex@^1.1.2: version "1.1.2" resolved "https://registry.yarnpkg.com/html-comment-regex/-/html-comment-regex-1.1.2.tgz#97d4688aeb5c81886a364faa0cad1dda14d433a7" @@ -13276,7 +13307,7 @@ ms@2.1.2: resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w== -ms@2.1.3, ms@^2.0.0, ms@^2.1.1: +ms@2.1.3, ms@^2.0.0, ms@^2.1.1, ms@^2.1.3: version "2.1.3" resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.3.tgz#574c8138ce1d2b5861f0b44579dbadd60c6615b2" integrity sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA== @@ -15716,7 +15747,16 @@ string-template@~0.2.1: resolved "https://registry.yarnpkg.com/string-template/-/string-template-0.2.1.tgz#42932e598a352d01fc22ec3367d9d84eec6c9add" integrity sha512-Yptehjogou2xm4UJbxJ4CxgZx12HBfeystp0y3x7s4Dj32ltVVG1Gg8YhKjHZkHicuKpZX/ffilA8505VbUbpw== -"string-width-cjs@npm:string-width@^4.2.0", "string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3: +"string-width-cjs@npm:string-width@^4.2.0": + version "4.2.3" + resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" + integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== + dependencies: + emoji-regex "^8.0.0" + is-fullwidth-code-point "^3.0.0" + strip-ansi "^6.0.1" + +"string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3: version "4.2.3" resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== @@ -15789,7 +15829,7 @@ stringify-object@3.3.0: is-obj "^1.0.1" is-regexp "^1.0.0" -"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@^6.0.0, strip-ansi@^6.0.1: +"strip-ansi-cjs@npm:strip-ansi@^6.0.1": version "6.0.1" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== @@ -15803,6 +15843,13 @@ strip-ansi@^3.0.0: dependencies: ansi-regex "^2.0.0" +strip-ansi@^6.0.0, strip-ansi@^6.0.1: + version "6.0.1" + resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" + integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== + dependencies: + ansi-regex "^5.0.1" + strip-ansi@^7.0.1: version "7.1.0" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.1.0.tgz#d5b6568ca689d8561370b0707685d22434faff45" @@ -16514,6 +16561,11 @@ undici@^5.21.2: dependencies: "@fastify/busboy" "^2.0.0" +undici@^6.12.0: + version "6.19.2" + resolved "https://registry.yarnpkg.com/undici/-/undici-6.19.2.tgz#231bc5de78d0dafb6260cf454b294576c2f3cd31" + integrity sha512-JfjKqIauur3Q6biAtHJ564e3bWa8VvT+7cSiOJHFbX4Erv6CLGDpg8z+Fmg/1OI/47RA+GI2QZaF48SSaLvyBA== + unique-filename@^1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/unique-filename/-/unique-filename-1.1.1.tgz#1d69769369ada0583103a1e6ae87681b56573230" @@ -16833,7 +16885,7 @@ workerpool@6.2.1: resolved "https://registry.yarnpkg.com/workerpool/-/workerpool-6.2.1.tgz#46fc150c17d826b86a008e5a4508656777e9c343" integrity sha512-ILEIE97kDZvF9Wb9f6h5aXK4swSlKGUcOEGiIYb2OOu/IrDU9iwj0fD//SsA6E5ibwJxpEvhullJY4Sl4GcpAw== -"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0: +"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0": version "7.0.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== @@ -16851,6 +16903,15 @@ wrap-ansi@^6.2.0: string-width "^4.1.0" strip-ansi "^6.0.0" +wrap-ansi@^7.0.0: + version "7.0.0" + resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" + integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== + dependencies: + ansi-styles "^4.0.0" + string-width "^4.1.0" + strip-ansi "^6.0.0" + wrap-ansi@^8.1.0: version "8.1.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214" From 71001598bd8c24c248f9b29f1c92496cc0580f2a Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 17 Jul 2024 17:39:17 +0000 Subject: [PATCH 02/25] add basic single query endpoint --- desci-server/src/controllers/search/query.ts | 103 +++++++++++++++++++ desci-server/src/routes/v1/index.ts | 2 + desci-server/src/routes/v1/search.ts | 10 ++ 3 files changed, 115 insertions(+) create mode 100644 desci-server/src/controllers/search/query.ts create mode 100644 desci-server/src/routes/v1/search.ts diff --git a/desci-server/src/controllers/search/query.ts b/desci-server/src/controllers/search/query.ts new file mode 100644 index 000000000..634ec6927 --- /dev/null +++ b/desci-server/src/controllers/search/query.ts @@ -0,0 +1,103 @@ +import { Request, Response } from 'express'; + +import { elasticClient } from '../../elasticSearchClient.js'; +import { logger as parentLogger } from '../../logger.js'; + +interface QuerySearchParams { + query: string; + entity: string; + fuzzy?: number; + sortType?: string; + sortOrder?: 'asc' | 'desc'; + page?: number; + perPage?: number; +} + +export const VALID_ENTITIES = ['authors', 'concepts', 'institutions', 'publishers', 'sources', 'topics', 'works']; + +export const singleQuery = async (req: Request, res: Response) => { + const { + query, + entity, + fuzzy, + sortType = 'relevance', + sortOrder, + page = 1, + perPage = 10, + }: QuerySearchParams = req.body; + const logger = parentLogger.child({ + module: 'SEARCH::Query', + query, + entity, + fuzzy, + sortType, + sortOrder, + page, + perPage, + }); + + logger.trace({ fn: 'Executing elastic search query' }); + + if (!VALID_ENTITIES.includes(entity)) { + return res.status(400).json({ + ok: false, + error: `Invalid entity: ${entity}, qthe following entities are supported: ${VALID_ENTITIES.join(' ')}`, + }); + } + + const esQuery = buildElasticSearchQuery(query, entity, fuzzy); + const esSort = buildSortQuery(sortType, sortOrder); + + try { + debugger; + logger.debug({ esQuery, esSort }, 'Executing query'); + const resp = await elasticClient.search({ + index: entity, + body: { + query: esQuery, + sort: esSort, + from: (page - 1) * perPage, + size: perPage, + }, + }); + debugger; + logger.info({ fn: 'Elastic search query executed successfully' }); + return res.status(200).send(resp); + + // res.json({ + // ok: true, + // data: hits.hits, + // total: hits.total.value, + // page, + // perPage, + // }); + } catch (error) { + logger.error({ fn: 'Elastic search query failed', error }); + return res.status(500).json({ + ok: false, + error: 'An error occurred while searching', + }); + } +}; + +function buildElasticSearchQuery(query: string, entity: string, fuzzy: number) { + return { + query_string: { + query: `${entity} AND (${query})`, + // [entity]: { + // query: query, + // }, + }, + }; +} + +function buildSortQuery(sortType: string, sortOrder?: string) { + const order = sortOrder === 'asc' ? 'asc' : 'desc'; + switch (sortType) { + case 'date': + return [{ year: order }]; + case 'relevance': + default: + return ['_score']; + } +} diff --git a/desci-server/src/routes/v1/index.ts b/desci-server/src/routes/v1/index.ts index 6fd4a4b07..a87e0ebae 100755 --- a/desci-server/src/routes/v1/index.ts +++ b/desci-server/src/routes/v1/index.ts @@ -21,6 +21,7 @@ import log from './log.js'; import nodes from './nodes.js'; import pub from './pub.js'; import referral from './referral.js'; +import search from './search.js'; import services from './services.js'; import users from './users.js'; import waitlist from './waitlist.js'; @@ -58,6 +59,7 @@ router.use('/services', services); router.use('/communities', communities); router.use('/attestations', attestations); router.use('/doi', doi); +router.use('/search', search); router.get('/nft/:id', nft); router.use('/referral', referral); diff --git a/desci-server/src/routes/v1/search.ts b/desci-server/src/routes/v1/search.ts new file mode 100644 index 000000000..b9bd3f8a0 --- /dev/null +++ b/desci-server/src/routes/v1/search.ts @@ -0,0 +1,10 @@ +import { Router } from 'express'; + +import { singleQuery } from '../../controllers/search/query.js'; +import { ensureUser } from '../../internal.js'; + +const router = Router(); + +router.post('/', [ensureUser], singleQuery); + +export default router; From 787843e950c6008c98c51b7449a5397386b0cb41 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:33:51 +0000 Subject: [PATCH 03/25] split es query building functions into its own svc --- .../src/services/ElasticSearchService.ts | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 desci-server/src/services/ElasticSearchService.ts diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts new file mode 100644 index 000000000..563b963ee --- /dev/null +++ b/desci-server/src/services/ElasticSearchService.ts @@ -0,0 +1,35 @@ +export const VALID_ENTITIES = ['authors', 'concepts', 'institutions', 'publishers', 'sources', 'topics', 'works']; + +export function buildSimpleStringQuery(query: string, entity: string, fuzzy?: number) { + return { + simple_query_string: { + query: query, + // [entity]: { + // query: query, + // }, + }, + }; +} + +export function buildBoolQuery(queries: any[]) { + return { + query: { + bool: { + // must: [], + should: queries, + // filter: [], + }, + }, + }; +} + +export function buildSortQuery(sortType: string, sortOrder?: string) { + const order = sortOrder === 'asc' ? 'asc' : 'desc'; + switch (sortType) { + case 'date': + return [{ year: order }]; + case 'relevance': + default: + return ['_score']; + } +} From c7ea5af850ece9eb9d67a3463ec50fc72423c924 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:34:22 +0000 Subject: [PATCH 04/25] add base for multiquerying --- .../src/controllers/search/multiQuery.ts | 91 +++++++++++++++++++ desci-server/src/controllers/search/query.ts | 29 +----- desci-server/src/routes/v1/search.ts | 2 + 3 files changed, 96 insertions(+), 26 deletions(-) create mode 100644 desci-server/src/controllers/search/multiQuery.ts diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts new file mode 100644 index 000000000..2481513de --- /dev/null +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -0,0 +1,91 @@ +import { Request, Response } from 'express'; + +import { elasticClient } from '../../elasticSearchClient.js'; +import { logger as parentLogger } from '../../logger.js'; +import { + buildBoolQuery, + buildSimpleStringQuery, + buildSortQuery, + VALID_ENTITIES, +} from '../../services/ElasticSearchService.js'; + +type Entity = string; +type Query = string; + +type QueryObject = Record; + +interface MultiQuerySearchParams { + queries: QueryObject[]; + fuzzy?: number; + sortType?: string; + sortOrder?: 'asc' | 'desc'; + page?: number; + perPage?: number; +} + +export const multiQuery = async (req: Request, res: Response) => { + const { + queries, + fuzzy, + sortType = 'relevance', + sortOrder, + page = 1, + perPage = 10, + }: MultiQuerySearchParams = req.body; + const logger = parentLogger.child({ + module: 'SEARCH::MultiQuery', + queries, + fuzzy, + sortType, + sortOrder, + page, + perPage, + }); + + logger.trace({ fn: 'Executing elastic search query' }); + + const validEntityQueries = queries.filter((q) => VALID_ENTITIES.includes(Object.keys(q)[0])); + + if (!validEntityQueries) { + return res.status(400).json({ + ok: false, + error: `Invalid queries, the following entities are supported: ${VALID_ENTITIES.join(' ')}`, + }); + } + + const esQueries = validEntityQueries.map((q) => { + return buildSimpleStringQuery(Object.values(q)[0], Object.keys(q)[0]); + }); + const esSort = buildSortQuery(sortType, sortOrder); + const esBoolQuery = buildBoolQuery(esQueries); + + try { + debugger; + logger.debug({ esQueries, esSort }, 'Executing query'); + const resp = await elasticClient.search({ + body: { + ...esBoolQuery, + sort: esSort, + from: (page - 1) * perPage, + size: perPage, + }, + }); + debugger; + logger.info({ fn: 'Elastic search query executed successfully' }); + return res.status(200).send(resp); + + // res.json({ + // ok: true, + // data: hits.hits, + // total: hits.total.value, + // page, + // perPage, + // }); + } catch (error) { + logger.error({ fn: 'Elastic search query failed', error }); + return res.status(500).json({ + ok: false, + error: 'An error occurred while searching', + }); + } +}; diff --git a/desci-server/src/controllers/search/query.ts b/desci-server/src/controllers/search/query.ts index 634ec6927..6f38e5896 100644 --- a/desci-server/src/controllers/search/query.ts +++ b/desci-server/src/controllers/search/query.ts @@ -2,6 +2,7 @@ import { Request, Response } from 'express'; import { elasticClient } from '../../elasticSearchClient.js'; import { logger as parentLogger } from '../../logger.js'; +import { buildSimpleStringQuery, buildSortQuery, VALID_ENTITIES } from '../../services/ElasticSearchService.js'; interface QuerySearchParams { query: string; @@ -13,8 +14,6 @@ interface QuerySearchParams { perPage?: number; } -export const VALID_ENTITIES = ['authors', 'concepts', 'institutions', 'publishers', 'sources', 'topics', 'works']; - export const singleQuery = async (req: Request, res: Response) => { const { query, @@ -41,11 +40,11 @@ export const singleQuery = async (req: Request, res: Response) => { if (!VALID_ENTITIES.includes(entity)) { return res.status(400).json({ ok: false, - error: `Invalid entity: ${entity}, qthe following entities are supported: ${VALID_ENTITIES.join(' ')}`, + error: `Invalid entity: ${entity}, the following entities are supported: ${VALID_ENTITIES.join(' ')}`, }); } - const esQuery = buildElasticSearchQuery(query, entity, fuzzy); + const esQuery = buildSimpleStringQuery(query, entity, fuzzy); const esSort = buildSortQuery(sortType, sortOrder); try { @@ -79,25 +78,3 @@ export const singleQuery = async (req: Request, res: Response) => { }); } }; - -function buildElasticSearchQuery(query: string, entity: string, fuzzy: number) { - return { - query_string: { - query: `${entity} AND (${query})`, - // [entity]: { - // query: query, - // }, - }, - }; -} - -function buildSortQuery(sortType: string, sortOrder?: string) { - const order = sortOrder === 'asc' ? 'asc' : 'desc'; - switch (sortType) { - case 'date': - return [{ year: order }]; - case 'relevance': - default: - return ['_score']; - } -} diff --git a/desci-server/src/routes/v1/search.ts b/desci-server/src/routes/v1/search.ts index b9bd3f8a0..4e88713cc 100644 --- a/desci-server/src/routes/v1/search.ts +++ b/desci-server/src/routes/v1/search.ts @@ -1,10 +1,12 @@ import { Router } from 'express'; +import { multiQuery } from '../../controllers/search/multiQuery.js'; import { singleQuery } from '../../controllers/search/query.js'; import { ensureUser } from '../../internal.js'; const router = Router(); +router.post('/multi', [ensureUser], multiQuery); router.post('/', [ensureUser], singleQuery); export default router; From add0932427d848bcfca3cf432ea8a6bc3ac455c4 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:22:05 +0000 Subject: [PATCH 05/25] refactor, add types, add multiMatch query builder helper --- .../src/controllers/search/multiQuery.ts | 26 +++++++----- desci-server/src/controllers/search/query.ts | 41 +++++++++++++------ .../src/services/ElasticSearchService.ts | 41 +++++++++++++++++++ 3 files changed, 85 insertions(+), 23 deletions(-) diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts index 2481513de..6bbae38a1 100644 --- a/desci-server/src/controllers/search/multiQuery.ts +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -9,6 +9,8 @@ import { VALID_ENTITIES, } from '../../services/ElasticSearchService.js'; +import { SingleQueryErrorResponse, SingleQuerySuccessResponse } from './query.js'; + type Entity = string; type Query = string; @@ -23,7 +25,10 @@ interface MultiQuerySearchParams { perPage?: number; } -export const multiQuery = async (req: Request, res: Response) => { +export const multiQuery = async ( + req: Request, + res: Response, +) => { const { queries, fuzzy, @@ -62,7 +67,7 @@ export const multiQuery = async (req: Request, res: Response) => { try { debugger; logger.debug({ esQueries, esSort }, 'Executing query'); - const resp = await elasticClient.search({ + const { hits } = await elasticClient.search({ body: { ...esBoolQuery, sort: esSort, @@ -71,16 +76,15 @@ export const multiQuery = async (req: Request, res: Response) => { }, }); debugger; - logger.info({ fn: 'Elastic search query executed successfully' }); - return res.status(200).send(resp); + logger.info({ fn: 'Elastic search multi query executed successfully' }); - // res.json({ - // ok: true, - // data: hits.hits, - // total: hits.total.value, - // page, - // perPage, - // }); + return res.json({ + ok: true, + total: hits.total, + page, + perPage, + data: hits.hits, + }); } catch (error) { logger.error({ fn: 'Elastic search query failed', error }); return res.status(500).json({ diff --git a/desci-server/src/controllers/search/query.ts b/desci-server/src/controllers/search/query.ts index 6f38e5896..fa2b4c86d 100644 --- a/desci-server/src/controllers/search/query.ts +++ b/desci-server/src/controllers/search/query.ts @@ -1,10 +1,24 @@ +import { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types.js'; import { Request, Response } from 'express'; import { elasticClient } from '../../elasticSearchClient.js'; import { logger as parentLogger } from '../../logger.js'; import { buildSimpleStringQuery, buildSortQuery, VALID_ENTITIES } from '../../services/ElasticSearchService.js'; -interface QuerySearchParams { +export interface SingleQuerySuccessResponse { + ok: true; + page: number; + perPage: number; + total: number | SearchTotalHits; + data: any[]; +} + +export interface SingleQueryErrorResponse { + ok: false; + error: string; +} + +interface QuerySearchBodyParams { query: string; entity: string; fuzzy?: number; @@ -14,7 +28,10 @@ interface QuerySearchParams { perPage?: number; } -export const singleQuery = async (req: Request, res: Response) => { +export const singleQuery = async ( + req: Request, + res: Response, +) => { const { query, entity, @@ -23,7 +40,7 @@ export const singleQuery = async (req: Request, res: Response) => { sortOrder, page = 1, perPage = 10, - }: QuerySearchParams = req.body; + }: QuerySearchBodyParams = req.body; const logger = parentLogger.child({ module: 'SEARCH::Query', query, @@ -50,7 +67,7 @@ export const singleQuery = async (req: Request, res: Response) => { try { debugger; logger.debug({ esQuery, esSort }, 'Executing query'); - const resp = await elasticClient.search({ + const { hits } = await elasticClient.search({ index: entity, body: { query: esQuery, @@ -61,15 +78,15 @@ export const singleQuery = async (req: Request, res: Response) => { }); debugger; logger.info({ fn: 'Elastic search query executed successfully' }); - return res.status(200).send(resp); + // return res.status(200).send({ esQuery, resp }); - // res.json({ - // ok: true, - // data: hits.hits, - // total: hits.total.value, - // page, - // perPage, - // }); + return res.json({ + ok: true, + total: hits.total, + page, + perPage, + data: hits.hits, + }); } catch (error) { logger.error({ fn: 'Elastic search query failed', error }); return res.status(500).json({ diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts index 563b963ee..c81776fb7 100644 --- a/desci-server/src/services/ElasticSearchService.ts +++ b/desci-server/src/services/ElasticSearchService.ts @@ -1,5 +1,14 @@ export const VALID_ENTITIES = ['authors', 'concepts', 'institutions', 'publishers', 'sources', 'topics', 'works']; +/** + * Ordered from most relevant to least relevant + */ +export const RELEVANT_FIELDS = { + works: ['title', 'abstract', 'doi'], + authors: ['display_name', 'orcid'], +}; +// abstract_inverted_index + export function buildSimpleStringQuery(query: string, entity: string, fuzzy?: number) { return { simple_query_string: { @@ -23,6 +32,20 @@ export function buildBoolQuery(queries: any[]) { }; } +export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: number) { + let fields = []; + if (entity === 'works') fields = RELEVANT_FIELDS.works; + if (entity === 'authors') fields = RELEVANT_FIELDS.works; + return { + multi_match: { + query: query, + fields: fields, + type: 'best_fields', + fuzziness: fuzzy || 'AUTO', + }, + }; +} + export function buildSortQuery(sortType: string, sortOrder?: string) { const order = sortOrder === 'asc' ? 'asc' : 'desc'; switch (sortType) { @@ -33,3 +56,21 @@ export function buildSortQuery(sortType: string, sortOrder?: string) { return ['_score']; } } + +export type IndexedAuthor = { + _index: string; + _id: string; + _score: number; + _source: { + works_count: number; + display_name: string; + cited_by_count: number; + works_api_url: string; + orcid: string | null; + id: string; + last_known_institution: any | null; + '@timestamp': string; + '@version': string; + updated_date: string; + }; +}; From 17dc28d842ff818f394a6d8aa97af3c03c215dcf Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:23:22 +0000 Subject: [PATCH 06/25] sorting overhaul, bug fixes --- .../src/controllers/search/multiQuery.ts | 15 ++++--- desci-server/src/controllers/search/query.ts | 16 +++++-- .../src/services/ElasticSearchService.ts | 43 ++++++++++++++----- 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts index 6bbae38a1..5eff8525b 100644 --- a/desci-server/src/controllers/search/multiQuery.ts +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -4,12 +4,13 @@ import { elasticClient } from '../../elasticSearchClient.js'; import { logger as parentLogger } from '../../logger.js'; import { buildBoolQuery, + buildMultiMatchQuery, buildSimpleStringQuery, buildSortQuery, VALID_ENTITIES, } from '../../services/ElasticSearchService.js'; -import { SingleQueryErrorResponse, SingleQuerySuccessResponse } from './query.js'; +import { QueryDebuggingResponse, SingleQueryErrorResponse, SingleQuerySuccessResponse } from './query.js'; type Entity = string; type Query = string; @@ -27,7 +28,7 @@ interface MultiQuerySearchParams { export const multiQuery = async ( req: Request, - res: Response, + res: Response<(SingleQuerySuccessResponse & QueryDebuggingResponse) | SingleQueryErrorResponse>, ) => { const { queries, @@ -59,13 +60,14 @@ export const multiQuery = async ( } const esQueries = validEntityQueries.map((q) => { - return buildSimpleStringQuery(Object.values(q)[0], Object.keys(q)[0]); + const [entity, query] = Object.entries(q)[0]; + return buildMultiMatchQuery(query, entity); }); - const esSort = buildSortQuery(sortType, sortOrder); + const primaryEntity = Object.keys(validEntityQueries[0])[0]; + const esSort = buildSortQuery(primaryEntity, sortType, sortOrder); const esBoolQuery = buildBoolQuery(esQueries); try { - debugger; logger.debug({ esQueries, esSort }, 'Executing query'); const { hits } = await elasticClient.search({ body: { @@ -75,10 +77,11 @@ export const multiQuery = async ( size: perPage, }, }); - debugger; + debugger; // logger.info({ fn: 'Elastic search multi query executed successfully' }); return res.json({ + esQueries, ok: true, total: hits.total, page, diff --git a/desci-server/src/controllers/search/query.ts b/desci-server/src/controllers/search/query.ts index fa2b4c86d..571c336c6 100644 --- a/desci-server/src/controllers/search/query.ts +++ b/desci-server/src/controllers/search/query.ts @@ -13,6 +13,12 @@ export interface SingleQuerySuccessResponse { data: any[]; } +export interface QueryDebuggingResponse { + esQuery?: any; + esQueries?: any; + esSort?: any; +} + export interface SingleQueryErrorResponse { ok: false; error: string; @@ -30,7 +36,7 @@ interface QuerySearchBodyParams { export const singleQuery = async ( req: Request, - res: Response, + res: Response, ) => { const { query, @@ -62,10 +68,10 @@ export const singleQuery = async ( } const esQuery = buildSimpleStringQuery(query, entity, fuzzy); - const esSort = buildSortQuery(sortType, sortOrder); + const esSort = buildSortQuery(entity, sortType, sortOrder); try { - debugger; + // debugger; logger.debug({ esQuery, esSort }, 'Executing query'); const { hits } = await elasticClient.search({ index: entity, @@ -76,7 +82,7 @@ export const singleQuery = async ( size: perPage, }, }); - debugger; + // debugger; logger.info({ fn: 'Elastic search query executed successfully' }); // return res.status(200).send({ esQuery, resp }); @@ -92,6 +98,8 @@ export const singleQuery = async ( return res.status(500).json({ ok: false, error: 'An error occurred while searching', + esQuery, + esSort, }); } }; diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts index c81776fb7..cd1b8898f 100644 --- a/desci-server/src/services/ElasticSearchService.ts +++ b/desci-server/src/services/ElasticSearchService.ts @@ -5,10 +5,32 @@ export const VALID_ENTITIES = ['authors', 'concepts', 'institutions', 'publisher */ export const RELEVANT_FIELDS = { works: ['title', 'abstract', 'doi'], - authors: ['display_name', 'orcid'], + authors: ['display_name', 'orcid', 'last_known_institution'], }; // abstract_inverted_index +type SortOrder = 'asc' | 'desc'; +type SortField = { [field: string]: { order: SortOrder; missing?: string } }; + +const baseSort: SortField[] = [{ _score: { order: 'desc' } }]; + +const sortConfigs: { [entity: string]: { [sortType: string]: (order: SortOrder) => SortField[] } } = { + works: { + publication_year: (order) => [{ publication_year: { order, missing: '_last' } }], + publication_date: (order) => [{ publication_date: { order, missing: '_last' } }], + cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], + title: (order) => [{ 'title.keyword': { order, missing: '_last' } }], + relevance: () => [{ publication_year: { order: 'desc', missing: '_last' } }], + }, + authors: { + display_name: (order) => [{ 'display_name.keyword': { order, missing: '_last' } }], + works_count: (order) => [{ works_count: { order, missing: '_last' } }], + cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], + updated_date: (order) => [{ updated_date: { order, missing: '_last' } }], + relevance: () => [], + }, +}; + export function buildSimpleStringQuery(query: string, entity: string, fuzzy?: number) { return { simple_query_string: { @@ -35,7 +57,7 @@ export function buildBoolQuery(queries: any[]) { export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: number) { let fields = []; if (entity === 'works') fields = RELEVANT_FIELDS.works; - if (entity === 'authors') fields = RELEVANT_FIELDS.works; + if (entity === 'authors') fields = RELEVANT_FIELDS.authors; return { multi_match: { query: query, @@ -46,15 +68,16 @@ export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: numb }; } -export function buildSortQuery(sortType: string, sortOrder?: string) { - const order = sortOrder === 'asc' ? 'asc' : 'desc'; - switch (sortType) { - case 'date': - return [{ year: order }]; - case 'relevance': - default: - return ['_score']; +export function buildSortQuery(entity: string, sortType?: string, sortOrder: SortOrder = 'desc'): SortField[] { + const entityConfig = sortConfigs[entity]; + if (!entityConfig) { + return baseSort; } + + const sortFunction = entityConfig[sortType] || entityConfig['relevance'] || (() => []); + const specificSort = sortFunction(sortOrder); + + return [...specificSort, ...baseSort]; } export type IndexedAuthor = { From 94376b9ea632504fa64c18a63bfd8bcc1f8b9fe6 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:29:17 +0000 Subject: [PATCH 07/25] local dev elastic search/kibana/logstash setup --- .env.example | 10 +++++++- desci-elastic/Dockerfile-logstash | 15 ++++++++++++ desci-elastic/init-logstash.sh | 38 +++++++++++++++++++++++++++++++ desci-elastic/logstash.conf | 30 ++++++++++++++++++++++++ docker-compose-es.yml | 38 +++++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 desci-elastic/Dockerfile-logstash create mode 100644 desci-elastic/init-logstash.sh create mode 100644 desci-elastic/logstash.conf create mode 100644 docker-compose-es.yml diff --git a/.env.example b/.env.example index b8dffd14a..0a41729d7 100755 --- a/.env.example +++ b/.env.example @@ -147,4 +147,12 @@ AUTOMATED_METADATA_API_KEY= # Elastic Search, required for /v1/search endpoints ELASTIC_SEARCH_NODE_URL= ELASTIC_SEARCH_USER= -ELASTIC_SEARCH_PW= \ No newline at end of file +ELASTIC_SEARCH_PW= + +# Elastic search local dev node configuration +ES_NODE=http://host.docker.internal:9200 +ES_DB_HOST= +ES_DB_PORT= +ES_DB_NAME= +ES_DB_USER= +ES_DB_PASSWORD= diff --git a/desci-elastic/Dockerfile-logstash b/desci-elastic/Dockerfile-logstash new file mode 100644 index 000000000..2b5dd33e0 --- /dev/null +++ b/desci-elastic/Dockerfile-logstash @@ -0,0 +1,15 @@ +FROM docker.elastic.co/logstash/logstash:8.14.3 + +USER root + +# Install curl +RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/* + +# Copy the initialization script +COPY desci-elastic/init-logstash.sh /usr/local/bin/init-logstash.sh +RUN chmod +x /usr/local/bin/init-logstash.sh + +USER logstash + +# Set the entrypoint to the initialization script +ENTRYPOINT ["/usr/local/bin/init-logstash.sh"] \ No newline at end of file diff --git a/desci-elastic/init-logstash.sh b/desci-elastic/init-logstash.sh new file mode 100644 index 000000000..d613fb2b3 --- /dev/null +++ b/desci-elastic/init-logstash.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +# Configuration +DRIVER_URL="https://jdbc.postgresql.org/download/postgresql-42.7.3.jar" +DRIVER_DIR="/opt/logstash/drivers" +DRIVER_FILE="$DRIVER_DIR/postgresql-42.7.3.jar" + +# Ensure the driver directory exists +mkdir -p "$DRIVER_DIR" + + +download_driver() { + echo "Downloading PostgreSQL JDBC driver..." + curl -# -o "$DRIVER_FILE" "$DRIVER_URL" + chmod 644 "$DRIVER_FILE" + echo "Driver downloaded and permissions set." +} + +# Check if driver exists and download if necessary +if [ -f "$DRIVER_FILE" ]; then + echo "PostgreSQL JDBC driver already exists." +else + download_driver +fi + +# Verify the driver file +if [ ! -f "$DRIVER_FILE" ]; then + echo "Error: Failed to download or locate the PostgreSQL JDBC driver." + exit 1 +fi + +# Ensure correct permissions on the driver file +chmod 644 "$DRIVER_FILE" + +# Start Logstash with the provided pipeline configuration +exec logstash -f /usr/share/logstash/pipeline/logstash.conf \ No newline at end of file diff --git a/desci-elastic/logstash.conf b/desci-elastic/logstash.conf new file mode 100644 index 000000000..ceb733317 --- /dev/null +++ b/desci-elastic/logstash.conf @@ -0,0 +1,30 @@ +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + schedule => "*/5 * * * *" + statement => "SELECT * FROM works WHERE id <= 1000 ORDER BY id ASC LIMIT 1000" + use_column_value => true + tracking_column => "id" + tracking_column_type => "numeric" + last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } +} + +output { + elasticsearch { + hosts => ["${ES_NODE}"] + index => "works" + document_id => "%{id}" + doc_as_upsert => true + } +} \ No newline at end of file diff --git a/docker-compose-es.yml b/docker-compose-es.yml new file mode 100644 index 000000000..23e6e7217 --- /dev/null +++ b/docker-compose-es.yml @@ -0,0 +1,38 @@ +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.14.3 + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + ports: + - "9200:9200" + volumes: + - ./local-data/esdata:/usr/share/elasticsearch/data + extra_hosts: + - "host.docker.internal:host-gateway" + + logstash: + build: + context: . + dockerfile: ./desci-elastic/Dockerfile-logstash + volumes: + - ./desci-elastic/logstash.conf:/usr/share/logstash/pipeline/logstash.conf + - ./local-data/logstash/drivers:/opt/logstash/drivers + env_file: + - .env + extra_hosts: + - "host.docker.internal:host-gateway" + depends_on: + - elasticsearch + + kibana: + image: docker.elastic.co/kibana/kibana:8.14.3 + environment: + - ELASTICSEARCH_HOSTS=http://elasticsearch:9200 + ports: + - "5601:5601" + extra_hosts: + - "host.docker.internal:host-gateway" + depends_on: + - elasticsearch From 9cd806db43ff059cc4d86986b2e7d7147a17f50d Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:56:13 +0000 Subject: [PATCH 08/25] working works/authors logstash confs --- desci-elastic/logstash-authors.conf | 62 ++++++++++++++++++++++++++++ desci-elastic/logstash-works.conf | 64 +++++++++++++++++++++++++++++ desci-elastic/logstash.conf | 30 -------------- docker-compose-es.yml | 2 +- 4 files changed, 127 insertions(+), 31 deletions(-) create mode 100644 desci-elastic/logstash-authors.conf create mode 100644 desci-elastic/logstash-works.conf delete mode 100644 desci-elastic/logstash.conf diff --git a/desci-elastic/logstash-authors.conf b/desci-elastic/logstash-authors.conf new file mode 100644 index 000000000..5a9d9b3cc --- /dev/null +++ b/desci-elastic/logstash-authors.conf @@ -0,0 +1,62 @@ +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + statement => " + SELECT + id::text, + orcid::text, + display_name::text, + display_name_alternatives::text, + works_count::text, + cited_by_count::text, + last_known_institution::text, + works_api_url::text, + updated_date::text + FROM openalex.authors + WHERE updated_date > CAST(:sql_last_value AS TIMESTAMP) + ORDER BY updated_date ASC, id ASC + LIMIT 1000 + " + use_column_value => true + tracking_column => "updated_date" + tracking_column_type => "timestamp" + last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" + jdbc_paging_enabled => true + jdbc_page_size => 1000 + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } + json { + source => "display_name_alternatives" + target => "display_name_alternatives" + skip_on_invalid_json => true + } + json { + source => "last_known_institution" + target => "last_known_institution" + skip_on_invalid_json => true + } + mutate { + convert => { + "works_count" => "integer" + "cited_by_count" => "integer" + } + } +} + +output { + elasticsearch { + hosts => ["${ES_NODE}"] + index => "authors" + document_id => "%{id}" + doc_as_upsert => true + } +} \ No newline at end of file diff --git a/desci-elastic/logstash-works.conf b/desci-elastic/logstash-works.conf new file mode 100644 index 000000000..12c51fe9b --- /dev/null +++ b/desci-elastic/logstash-works.conf @@ -0,0 +1,64 @@ +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + statement => " + SELECT + id::TEXT, + orcid::TEXT, + display_name::TEXT, + display_name_alternatives::TEXT, + works_count::TEXT, + cited_by_count::TEXT, + last_known_institution::TEXT, + works_api_url::TEXT, + updated_date::TEXT + FROM openalex.authors + WHERE updated_date > CAST(:sql_last_value AS TIMESTAMP) + ORDER BY updated_date ASC, id ASC + LIMIT 1000 + " + use_column_value => true + tracking_column => "updated_date" + tracking_column_type => "timestamp" + last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" + jdbc_paging_enabled => true + jdbc_page_size => 1000 + codec => json + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } + json { + source => "display_name_alternatives" + target => "display_name_alternatives" + skip_on_invalid_json => true + } + json { + source => "last_known_institution" + target => "last_known_institution" + skip_on_invalid_json => true + } + mutate { + convert => { + "works_count" => "integer" + "cited_by_count" => "integer" + } + } +} + +output { + stdout { codec => json } + elasticsearch { + hosts => ["${ES_NODE}"] + index => "authors" + document_id => "%{id}" + doc_as_upsert => true + } +} \ No newline at end of file diff --git a/desci-elastic/logstash.conf b/desci-elastic/logstash.conf deleted file mode 100644 index ceb733317..000000000 --- a/desci-elastic/logstash.conf +++ /dev/null @@ -1,30 +0,0 @@ -input { - jdbc { - jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" - jdbc_driver_class => "org.postgresql.Driver" - jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" - jdbc_user => "${ES_DB_USER}" - jdbc_password => "${ES_DB_PASSWORD}" - schedule => "*/5 * * * *" - statement => "SELECT * FROM works WHERE id <= 1000 ORDER BY id ASC LIMIT 1000" - use_column_value => true - tracking_column => "id" - tracking_column_type => "numeric" - last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" - } -} - -filter { - mutate { - remove_field => ["@version", "@timestamp"] - } -} - -output { - elasticsearch { - hosts => ["${ES_NODE}"] - index => "works" - document_id => "%{id}" - doc_as_upsert => true - } -} \ No newline at end of file diff --git a/docker-compose-es.yml b/docker-compose-es.yml index 23e6e7217..6d946a403 100644 --- a/docker-compose-es.yml +++ b/docker-compose-es.yml @@ -4,7 +4,7 @@ services: environment: - discovery.type=single-node - xpack.security.enabled=false - - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" ports: - "9200:9200" volumes: From e4723f4b099043bb3528a9f68bc14bae577f08cb Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 24 Jul 2024 12:45:36 +0000 Subject: [PATCH 09/25] index works/authorships logstash config --- desci-elastic/logstash-works-authorships.conf | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 desci-elastic/logstash-works-authorships.conf diff --git a/desci-elastic/logstash-works-authorships.conf b/desci-elastic/logstash-works-authorships.conf new file mode 100644 index 000000000..3279d3a0f --- /dev/null +++ b/desci-elastic/logstash-works-authorships.conf @@ -0,0 +1,37 @@ +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + statement => " + SELECT + work_id, + author_id, + author_position, + raw_affiliation_string, + institution_id + FROM openalex.works_authorships + ORDER BY work_id ASC + LIMIT 10000 + " + jdbc_paging_enabled => true + jdbc_page_size => 1000 + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } +} + +output { + elasticsearch { + hosts => ["${ES_NODE}"] + index => "works_authorships" + document_id => "%{work_id}-%{author_id}" + doc_as_upsert => true + } +} \ No newline at end of file From 0f06349c673a998c62d62b412b1d8361160e1731 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 24 Jul 2024 18:25:57 +0000 Subject: [PATCH 10/25] added denormalized works+authorships logstash conf --- .../logstash-denorm-works-authorships.conf | 96 +++++++++++++++++++ docker-compose-es.yml | 4 +- 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 desci-elastic/logstash-denorm-works-authorships.conf diff --git a/desci-elastic/logstash-denorm-works-authorships.conf b/desci-elastic/logstash-denorm-works-authorships.conf new file mode 100644 index 000000000..b58d62c8c --- /dev/null +++ b/desci-elastic/logstash-denorm-works-authorships.conf @@ -0,0 +1,96 @@ +input { + jdbc { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + jdbc_paging_enabled => true + jdbc_page_size => 100 + use_column_value => true + tracking_column => "publication_date" + tracking_column_type => "timestamp" + last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" + statement => " + SELECT + w.id::TEXT AS work_id, + w.doi::TEXT, + w.title::TEXT, + w.publication_year::TEXT, + w.type::TEXT, + w.cited_by_count::TEXT AS cited_by_count, + w.abstract_inverted_index::TEXT as abstract_inverted_index, + w.publication_date::TIMESTAMP AS publication_date + FROM openalex.works w + WHERE w.publication_date::TIMESTAMP > CAST(:sql_last_value AS TIMESTAMP) + ORDER BY w.publication_date::TIMESTAMP ASC, w.id ASC + LIMIT 100 + " + codec => json + } +} + +filter { + mutate { + remove_field => ["@version", "@timestamp"] + } + json { + source => "abstract_inverted_index" + target => "abstract_inverted_index_parsed" + } + ruby { + code => ' + abstract_inverted_index = event.get("abstract_inverted_index_parsed") + if abstract_inverted_index + abstract_length = abstract_inverted_index.values.flatten.max + 1 + abstract_words = Array.new(abstract_length, "") + abstract_inverted_index.each do |word, positions| + positions.each do |position| + abstract_words[position] = word + end + end + abstract = abstract_words.join(" ") + event.set("abstract", abstract) + end + ' + } + mutate { + remove_field => ["abstract_inverted_index", "abstract_inverted_index_parsed"] + convert => { + "cited_by_count" => "integer" + "publication_year" => "integer" + } + } + jdbc_streaming { + jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" + jdbc_driver_class => "org.postgresql.Driver" + jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" + jdbc_user => "${ES_DB_USER}" + jdbc_password => "${ES_DB_PASSWORD}" + statement => " + SELECT + a.id AS author_id, + wa.author_position, + a.display_name AS author_name, + a.works_count AS author_works_count, + a.cited_by_count AS author_cited_by_count, + wa.institution_id + FROM openalex.works_authorships wa + JOIN openalex.authors a ON wa.author_id = a.id + WHERE wa.work_id = :work_id + ORDER BY wa.author_position ASC + " + parameters => { "work_id" => "work_id" } + target => "authors" + } +} + +output { + stdout { codec => json } + elasticsearch { + hosts => ["${ES_NODE}"] + index => "denormalized_works_test" + document_id => "%{[work_id]}" + doc_as_upsert => true + } +} \ No newline at end of file diff --git a/docker-compose-es.yml b/docker-compose-es.yml index 6d946a403..484c8c81a 100644 --- a/docker-compose-es.yml +++ b/docker-compose-es.yml @@ -4,7 +4,7 @@ services: environment: - discovery.type=single-node - xpack.security.enabled=false - - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" + - "ES_JAVA_OPTS=-Xms3g -Xmx3g" ports: - "9200:9200" volumes: @@ -19,6 +19,8 @@ services: volumes: - ./desci-elastic/logstash.conf:/usr/share/logstash/pipeline/logstash.conf - ./local-data/logstash/drivers:/opt/logstash/drivers + environment: + - "LS_JAVA_OPTS=-Xms3g -Xmx3g" env_file: - .env extra_hosts: From a80207d33daa7576888357d22e8424ac65ba18ab Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Thu, 25 Jul 2024 08:34:01 +0000 Subject: [PATCH 11/25] add orcid to denorm works import script --- desci-elastic/logstash-denorm-works-authorships.conf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/desci-elastic/logstash-denorm-works-authorships.conf b/desci-elastic/logstash-denorm-works-authorships.conf index b58d62c8c..35e97045a 100644 --- a/desci-elastic/logstash-denorm-works-authorships.conf +++ b/desci-elastic/logstash-denorm-works-authorships.conf @@ -74,7 +74,8 @@ filter { a.display_name AS author_name, a.works_count AS author_works_count, a.cited_by_count AS author_cited_by_count, - wa.institution_id + wa.institution_id, + a.orcid FROM openalex.works_authorships wa JOIN openalex.authors a ON wa.author_id = a.id WHERE wa.work_id = :work_id @@ -89,7 +90,7 @@ output { stdout { codec => json } elasticsearch { hosts => ["${ES_NODE}"] - index => "denormalized_works_test" + index => "denormalized_works_test2" document_id => "%{[work_id]}" doc_as_upsert => true } From d926dec58586c0dd188f0aea8ef0b5405b48d2d7 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Thu, 25 Jul 2024 11:45:57 +0000 Subject: [PATCH 12/25] local elasticsearch stack setup --- ...m-works-authorships.conf => logstash.conf} | 1 + .../src/controllers/search/multiQuery.ts | 2 +- desci-server/src/controllers/search/query.ts | 3 +- desci-server/src/elasticSearchClient.ts | 15 ++++++--- docker-compose-es.yml | 32 +++++++++---------- 5 files changed, 30 insertions(+), 23 deletions(-) rename desci-elastic/{logstash-denorm-works-authorships.conf => logstash.conf} (96%) diff --git a/desci-elastic/logstash-denorm-works-authorships.conf b/desci-elastic/logstash.conf similarity index 96% rename from desci-elastic/logstash-denorm-works-authorships.conf rename to desci-elastic/logstash.conf index 35e97045a..aa484f804 100644 --- a/desci-elastic/logstash-denorm-works-authorships.conf +++ b/desci-elastic/logstash.conf @@ -1,3 +1,4 @@ +# Imports a denormalized works table including the authors table and the works_authorships join table input { jdbc { jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts index 5eff8525b..ce5820721 100644 --- a/desci-server/src/controllers/search/multiQuery.ts +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -89,7 +89,7 @@ export const multiQuery = async ( data: hits.hits, }); } catch (error) { - logger.error({ fn: 'Elastic search query failed', error }); + logger.error({ error }, 'Elastic search multi query failed'); return res.status(500).json({ ok: false, error: 'An error occurred while searching', diff --git a/desci-server/src/controllers/search/query.ts b/desci-server/src/controllers/search/query.ts index 571c336c6..5bbd145ba 100644 --- a/desci-server/src/controllers/search/query.ts +++ b/desci-server/src/controllers/search/query.ts @@ -82,7 +82,6 @@ export const singleQuery = async ( size: perPage, }, }); - // debugger; logger.info({ fn: 'Elastic search query executed successfully' }); // return res.status(200).send({ esQuery, resp }); @@ -94,7 +93,7 @@ export const singleQuery = async ( data: hits.hits, }); } catch (error) { - logger.error({ fn: 'Elastic search query failed', error }); + logger.error({ error }, 'Elastic search query failed'); return res.status(500).json({ ok: false, error: 'An error occurred while searching', diff --git a/desci-server/src/elasticSearchClient.ts b/desci-server/src/elasticSearchClient.ts index 01f4bc742..5d6c22b9c 100644 --- a/desci-server/src/elasticSearchClient.ts +++ b/desci-server/src/elasticSearchClient.ts @@ -8,13 +8,20 @@ if (!esNodeUrl || !esUser || !esPw) { console.error('Missing environment variables for ElasticSearch'); } +const esAuthConfig = !esNodeUrl.includes('host.docker.internal') + ? { + // Auth unnecessary if running local ES node + auth: { + username: esUser, + password: esPw, + }, + } + : {}; + export const elasticClient = new Client({ node: esNodeUrl, - auth: { - username: esUser, - password: esPw, - }, + ...esAuthConfig, tls: { rejectUnauthorized: false, // Temporary }, diff --git a/docker-compose-es.yml b/docker-compose-es.yml index 484c8c81a..b5be8b96b 100644 --- a/docker-compose-es.yml +++ b/docker-compose-es.yml @@ -4,7 +4,7 @@ services: environment: - discovery.type=single-node - xpack.security.enabled=false - - "ES_JAVA_OPTS=-Xms3g -Xmx3g" + - "ES_JAVA_OPTS=-Xms3g -Xmx8g" ports: - "9200:9200" volumes: @@ -12,21 +12,21 @@ services: extra_hosts: - "host.docker.internal:host-gateway" - logstash: - build: - context: . - dockerfile: ./desci-elastic/Dockerfile-logstash - volumes: - - ./desci-elastic/logstash.conf:/usr/share/logstash/pipeline/logstash.conf - - ./local-data/logstash/drivers:/opt/logstash/drivers - environment: - - "LS_JAVA_OPTS=-Xms3g -Xmx3g" - env_file: - - .env - extra_hosts: - - "host.docker.internal:host-gateway" - depends_on: - - elasticsearch + # logstash: + # build: + # context: . + # dockerfile: ./desci-elastic/Dockerfile-logstash + # volumes: + # - ./desci-elastic/logstash.conf:/usr/share/logstash/pipeline/logstash.conf + # - ./local-data/logstash/drivers:/opt/logstash/drivers + # environment: + # - "LS_JAVA_OPTS=-Xms3g -Xmx4g" + # env_file: + # - .env + # extra_hosts: + # - "host.docker.internal:host-gateway" + # depends_on: + # - elasticsearch kibana: image: docker.elastic.co/kibana/kibana:8.14.3 From da73fd91e5dbab97b2df6c2b6a7733837557b48a Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Fri, 26 Jul 2024 11:56:26 +0000 Subject: [PATCH 13/25] multi query functional for test denormalized workks+authors index --- .../src/controllers/search/multiQuery.ts | 4 +++- .../src/services/ElasticSearchService.ts | 22 +++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts index ce5820721..a87aec7a2 100644 --- a/desci-server/src/controllers/search/multiQuery.ts +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -59,12 +59,14 @@ export const multiQuery = async ( }); } + const hardcodedMultiIndex = 'denormalized_works_test2'; + const esQueries = validEntityQueries.map((q) => { const [entity, query] = Object.entries(q)[0]; return buildMultiMatchQuery(query, entity); }); const primaryEntity = Object.keys(validEntityQueries[0])[0]; - const esSort = buildSortQuery(primaryEntity, sortType, sortOrder); + const esSort = buildSortQuery(hardcodedMultiIndex, sortType, sortOrder); const esBoolQuery = buildBoolQuery(esQueries); try { diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts index cd1b8898f..7f0ce03bf 100644 --- a/desci-server/src/services/ElasticSearchService.ts +++ b/desci-server/src/services/ElasticSearchService.ts @@ -1,11 +1,21 @@ -export const VALID_ENTITIES = ['authors', 'concepts', 'institutions', 'publishers', 'sources', 'topics', 'works']; +export const VALID_ENTITIES = [ + 'authors', + 'concepts', + 'institutions', + 'publishers', + 'sources', + 'topics', + 'works', + 'denormalized_works_test2', +]; /** * Ordered from most relevant to least relevant */ export const RELEVANT_FIELDS = { works: ['title', 'abstract', 'doi'], - authors: ['display_name', 'orcid', 'last_known_institution'], + authors: ['author.display_name', 'author.orcid', 'author.last_known_institution'], + // authors: ['display_name', 'orcid', 'last_known_institution'], }; // abstract_inverted_index @@ -29,6 +39,14 @@ const sortConfigs: { [entity: string]: { [sortType: string]: (order: SortOrder) updated_date: (order) => [{ updated_date: { order, missing: '_last' } }], relevance: () => [], }, + denormalized_works_test2: { + publication_year: (order) => [{ publication_year: { order, missing: '_last' } }], + publication_date: (order) => [{ publication_date: { order, missing: '_last' } }], + cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], + title: (order) => [{ 'title.keyword': { order, missing: '_last' } }], + author_name: (order) => [{ 'authors.display_name.keyword': { order, missing: '_last' } }], + relevance: () => [{ publication_year: { order: 'desc', missing: '_last' } }], + }, }; export function buildSimpleStringQuery(query: string, entity: string, fuzzy?: number) { From a7a196d0d465e20074748f4e36b0c98b820683e5 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:37:52 +0000 Subject: [PATCH 14/25] fix multiquery functionality for good results --- desci-server/src/controllers/search/multiQuery.ts | 1 + desci-server/src/services/ElasticSearchService.ts | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts index a87aec7a2..d560e5fe2 100644 --- a/desci-server/src/controllers/search/multiQuery.ts +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -72,6 +72,7 @@ export const multiQuery = async ( try { logger.debug({ esQueries, esSort }, 'Executing query'); const { hits } = await elasticClient.search({ + index: hardcodedMultiIndex, body: { ...esBoolQuery, sort: esSort, diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts index 7f0ce03bf..73ac2112b 100644 --- a/desci-server/src/services/ElasticSearchService.ts +++ b/desci-server/src/services/ElasticSearchService.ts @@ -14,8 +14,8 @@ export const VALID_ENTITIES = [ */ export const RELEVANT_FIELDS = { works: ['title', 'abstract', 'doi'], - authors: ['author.display_name', 'author.orcid', 'author.last_known_institution'], - // authors: ['display_name', 'orcid', 'last_known_institution'], + authors: ['display_name', 'orcid', 'last_known_institution'], + denorm_authors: ['authors.author_name', 'authors.orcid', 'authors.last_known_institution'], }; // abstract_inverted_index @@ -44,7 +44,7 @@ const sortConfigs: { [entity: string]: { [sortType: string]: (order: SortOrder) publication_date: (order) => [{ publication_date: { order, missing: '_last' } }], cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], title: (order) => [{ 'title.keyword': { order, missing: '_last' } }], - author_name: (order) => [{ 'authors.display_name.keyword': { order, missing: '_last' } }], + author_name: (order) => [{ 'authors.author_name.keyword': { order, missing: '_last' } }], relevance: () => [{ publication_year: { order: 'desc', missing: '_last' } }], }, }; @@ -75,7 +75,7 @@ export function buildBoolQuery(queries: any[]) { export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: number) { let fields = []; if (entity === 'works') fields = RELEVANT_FIELDS.works; - if (entity === 'authors') fields = RELEVANT_FIELDS.authors; + if (entity === 'authors') fields = RELEVANT_FIELDS.denorm_authors; return { multi_match: { query: query, From 93c579dc369080ee1fd19b07bb11341e4147e13f Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Tue, 30 Jul 2024 12:17:01 +0000 Subject: [PATCH 15/25] save --- desci-server/src/controllers/search/multiQuery.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts index d560e5fe2..cc2b95de6 100644 --- a/desci-server/src/controllers/search/multiQuery.ts +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -5,7 +5,6 @@ import { logger as parentLogger } from '../../logger.js'; import { buildBoolQuery, buildMultiMatchQuery, - buildSimpleStringQuery, buildSortQuery, VALID_ENTITIES, } from '../../services/ElasticSearchService.js'; From 6dde809fcf50ebb67655edf29bf9a0529cc08d7d Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:48:21 +0000 Subject: [PATCH 16/25] deployment config, query function refinements --- desci-server/kubernetes/deployment_dev.yaml | 3 ++ desci-server/kubernetes/deployment_prod.yaml | 3 ++ .../kubernetes/deployment_staging.yaml | 3 ++ .../src/controllers/search/multiQuery.ts | 11 +++--- desci-server/src/controllers/search/query.ts | 39 ++++++++++++------- .../src/services/ElasticSearchService.ts | 25 +++++++++--- 6 files changed, 58 insertions(+), 26 deletions(-) diff --git a/desci-server/kubernetes/deployment_dev.yaml b/desci-server/kubernetes/deployment_dev.yaml index 3c083523b..ab583228d 100644 --- a/desci-server/kubernetes/deployment_dev.yaml +++ b/desci-server/kubernetes/deployment_dev.yaml @@ -90,6 +90,9 @@ spec: export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }} export AUTOMATED_METADATA_API="{{ .Data.AUTOMATED_METADATA_API }}" export AUTOMATED_METADATA_API_KEY="{{ .Data.AUTOMATED_METADATA_API_KEY }}" + export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}" + export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}" + export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}" export DEBUG_TEST=0; echo "appfinish"; {{- end -}} diff --git a/desci-server/kubernetes/deployment_prod.yaml b/desci-server/kubernetes/deployment_prod.yaml index 794d45908..b7f2db79e 100755 --- a/desci-server/kubernetes/deployment_prod.yaml +++ b/desci-server/kubernetes/deployment_prod.yaml @@ -90,6 +90,9 @@ spec: export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }} export AUTOMATED_METADATA_API="{{ .Data.AUTOMATED_METADATA_API }}" export AUTOMATED_METADATA_API_KEY="{{ .Data.AUTOMATED_METADATA_API_KEY }}" + export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}" + export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}" + export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}" export IGNORE_LINE=0; export DEBUG_TEST=0; echo "appfinish"; diff --git a/desci-server/kubernetes/deployment_staging.yaml b/desci-server/kubernetes/deployment_staging.yaml index bd3591b8f..81f187e2d 100644 --- a/desci-server/kubernetes/deployment_staging.yaml +++ b/desci-server/kubernetes/deployment_staging.yaml @@ -102,6 +102,9 @@ spec: export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }} export AUTOMATED_METADATA_API={{ .Data.AUTOMATED_METADATA_API }} export AUTOMATED_METADATA_API_KEY={{ .Data.AUTOMATED_METADATA_API_KEY }} + export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}" + export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}" + export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}" export DEBUG_TEST=0; echo "appfinish"; {{- end -}} diff --git a/desci-server/src/controllers/search/multiQuery.ts b/desci-server/src/controllers/search/multiQuery.ts index cc2b95de6..401a82dbf 100644 --- a/desci-server/src/controllers/search/multiQuery.ts +++ b/desci-server/src/controllers/search/multiQuery.ts @@ -6,6 +6,7 @@ import { buildBoolQuery, buildMultiMatchQuery, buildSortQuery, + DENORMALIZED_WORKS_INDEX, VALID_ENTITIES, } from '../../services/ElasticSearchService.js'; @@ -58,20 +59,18 @@ export const multiQuery = async ( }); } - const hardcodedMultiIndex = 'denormalized_works_test2'; - const esQueries = validEntityQueries.map((q) => { const [entity, query] = Object.entries(q)[0]; return buildMultiMatchQuery(query, entity); }); const primaryEntity = Object.keys(validEntityQueries[0])[0]; - const esSort = buildSortQuery(hardcodedMultiIndex, sortType, sortOrder); + const esSort = buildSortQuery(DENORMALIZED_WORKS_INDEX, sortType, sortOrder); const esBoolQuery = buildBoolQuery(esQueries); try { logger.debug({ esQueries, esSort }, 'Executing query'); const { hits } = await elasticClient.search({ - index: hardcodedMultiIndex, + index: DENORMALIZED_WORKS_INDEX, body: { ...esBoolQuery, sort: esSort, @@ -79,11 +78,11 @@ export const multiQuery = async ( size: perPage, }, }); - debugger; // + logger.info({ fn: 'Elastic search multi query executed successfully' }); return res.json({ - esQueries, + // esQueries, ok: true, total: hits.total, page, diff --git a/desci-server/src/controllers/search/query.ts b/desci-server/src/controllers/search/query.ts index 5bbd145ba..2ad610208 100644 --- a/desci-server/src/controllers/search/query.ts +++ b/desci-server/src/controllers/search/query.ts @@ -3,9 +3,16 @@ import { Request, Response } from 'express'; import { elasticClient } from '../../elasticSearchClient.js'; import { logger as parentLogger } from '../../logger.js'; -import { buildSimpleStringQuery, buildSortQuery, VALID_ENTITIES } from '../../services/ElasticSearchService.js'; +import { + buildBoolQuery, + buildMultiMatchQuery, + buildSimpleStringQuery, + buildSortQuery, + DENORMALIZED_WORKS_INDEX, + VALID_ENTITIES, +} from '../../services/ElasticSearchService.js'; -export interface SingleQuerySuccessResponse { +export interface SingleQuerySuccessResponse extends QueryDebuggingResponse { ok: true; page: number; perPage: number; @@ -19,7 +26,7 @@ export interface QueryDebuggingResponse { esSort?: any; } -export interface SingleQueryErrorResponse { +export interface SingleQueryErrorResponse extends QueryDebuggingResponse { ok: false; error: string; } @@ -36,17 +43,12 @@ interface QuerySearchBodyParams { export const singleQuery = async ( req: Request, - res: Response, + res: Response, ) => { - const { - query, - entity, - fuzzy, - sortType = 'relevance', - sortOrder, - page = 1, - perPage = 10, - }: QuerySearchBodyParams = req.body; + const { query, fuzzy, sortType = 'relevance', sortOrder, page = 1, perPage = 10 }: QuerySearchBodyParams = req.body; + + let { entity } = req.body; + const logger = parentLogger.child({ module: 'SEARCH::Query', query, @@ -57,7 +59,11 @@ export const singleQuery = async ( page, perPage, }); - + if (entity === 'works') { + logger.info({ entity }, `Entity is 'works', changing to denormalized works index: ${DENORMALIZED_WORKS_INDEX}`); + entity = DENORMALIZED_WORKS_INDEX; + } + // logger.trace({ fn: 'Executing elastic search query' }); if (!VALID_ENTITIES.includes(entity)) { @@ -67,7 +73,8 @@ export const singleQuery = async ( }); } - const esQuery = buildSimpleStringQuery(query, entity, fuzzy); + // const esQuery = buildSimpleStringQuery(query, entity, fuzzy); + const esQuery = buildMultiMatchQuery(query, 'works_single', fuzzy); const esSort = buildSortQuery(entity, sortType, sortOrder); try { @@ -86,6 +93,8 @@ export const singleQuery = async ( // return res.status(200).send({ esQuery, resp }); return res.json({ + esQuery, + esSort, ok: true, total: hits.total, page, diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts index 73ac2112b..3a28789c3 100644 --- a/desci-server/src/services/ElasticSearchService.ts +++ b/desci-server/src/services/ElasticSearchService.ts @@ -1,3 +1,6 @@ +import { QueryDslTextQueryType } from '@elastic/elasticsearch/lib/api/types.js'; + +export const DENORMALIZED_WORKS_INDEX = 'denormalized_works_test_2024_08_01'; export const VALID_ENTITIES = [ 'authors', 'concepts', @@ -6,7 +9,7 @@ export const VALID_ENTITIES = [ 'sources', 'topics', 'works', - 'denormalized_works_test2', + DENORMALIZED_WORKS_INDEX, ]; /** @@ -16,6 +19,14 @@ export const RELEVANT_FIELDS = { works: ['title', 'abstract', 'doi'], authors: ['display_name', 'orcid', 'last_known_institution'], denorm_authors: ['authors.author_name', 'authors.orcid', 'authors.last_known_institution'], + works_single: [ + 'title^3', + 'abstract', + 'doi', + 'authors.author_name', + 'authors.orcid', + 'authors.last_known_institution', + ], }; // abstract_inverted_index @@ -30,7 +41,7 @@ const sortConfigs: { [entity: string]: { [sortType: string]: (order: SortOrder) publication_date: (order) => [{ publication_date: { order, missing: '_last' } }], cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], title: (order) => [{ 'title.keyword': { order, missing: '_last' } }], - relevance: () => [{ publication_year: { order: 'desc', missing: '_last' } }], + relevance: () => [], }, authors: { display_name: (order) => [{ 'display_name.keyword': { order, missing: '_last' } }], @@ -39,13 +50,13 @@ const sortConfigs: { [entity: string]: { [sortType: string]: (order: SortOrder) updated_date: (order) => [{ updated_date: { order, missing: '_last' } }], relevance: () => [], }, - denormalized_works_test2: { + [DENORMALIZED_WORKS_INDEX]: { publication_year: (order) => [{ publication_year: { order, missing: '_last' } }], publication_date: (order) => [{ publication_date: { order, missing: '_last' } }], cited_by_count: (order) => [{ cited_by_count: { order, missing: '_last' } }], title: (order) => [{ 'title.keyword': { order, missing: '_last' } }], author_name: (order) => [{ 'authors.author_name.keyword': { order, missing: '_last' } }], - relevance: () => [{ publication_year: { order: 'desc', missing: '_last' } }], + relevance: () => [], }, }; @@ -76,11 +87,14 @@ export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: numb let fields = []; if (entity === 'works') fields = RELEVANT_FIELDS.works; if (entity === 'authors') fields = RELEVANT_FIELDS.denorm_authors; + if (entity === 'works_single') fields = RELEVANT_FIELDS.works_single; + + const type: QueryDslTextQueryType = 'best_fields'; return { multi_match: { query: query, fields: fields, - type: 'best_fields', + type, fuzziness: fuzzy || 'AUTO', }, }; @@ -95,6 +109,7 @@ export function buildSortQuery(entity: string, sortType?: string, sortOrder: Sor const sortFunction = entityConfig[sortType] || entityConfig['relevance'] || (() => []); const specificSort = sortFunction(sortOrder); + // return [...baseSort]; return [...specificSort, ...baseSort]; } From ca5799fd771b78138b40990433b04dc7bb0e276b Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Fri, 2 Aug 2024 18:06:11 +0000 Subject: [PATCH 17/25] work citation and author citation field boosting --- .../src/services/ElasticSearchService.ts | 42 ++++++++++++++++++- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts index 3a28789c3..d4e44f3d4 100644 --- a/desci-server/src/services/ElasticSearchService.ts +++ b/desci-server/src/services/ElasticSearchService.ts @@ -20,7 +20,7 @@ export const RELEVANT_FIELDS = { authors: ['display_name', 'orcid', 'last_known_institution'], denorm_authors: ['authors.author_name', 'authors.orcid', 'authors.last_known_institution'], works_single: [ - 'title^3', + 'title^1.25', 'abstract', 'doi', 'authors.author_name', @@ -60,6 +60,34 @@ const sortConfigs: { [entity: string]: { [sortType: string]: (order: SortOrder) }, }; +export function scoreBoostFunction(query: Record<'multi_match', MultiMatchQuery>) { + return { + function_score: { + query, + functions: [ + { + field_value_factor: { + field: 'cited_by_count', + factor: 1.5, + modifier: 'log1p', + missing: 0, + }, + }, + { + field_value_factor: { + field: 'authors.author_cited_by_count', + factor: 0.1, + modifier: 'log1p', + missing: 0, + }, + }, + ], + boost_mode: 'sum', + score_mode: 'sum', + }, + }; +} + export function buildSimpleStringQuery(query: string, entity: string, fuzzy?: number) { return { simple_query_string: { @@ -90,7 +118,7 @@ export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: numb if (entity === 'works_single') fields = RELEVANT_FIELDS.works_single; const type: QueryDslTextQueryType = 'best_fields'; - return { + const multiMatchQuery = { multi_match: { query: query, fields: fields, @@ -98,6 +126,9 @@ export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: numb fuzziness: fuzzy || 'AUTO', }, }; + + if (entity === 'works_single') return scoreBoostFunction(multiMatchQuery); + return multiMatchQuery; } export function buildSortQuery(entity: string, sortType?: string, sortOrder: SortOrder = 'desc'): SortField[] { @@ -130,3 +161,10 @@ export type IndexedAuthor = { updated_date: string; }; }; + +export interface MultiMatchQuery { + query: string; + fields: any[]; + type: 'best_fields'; + fuzziness: string | number; +} From f5496f6cc6e9729280ee323179e8eac766de74a6 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Fri, 2 Aug 2024 18:34:34 +0000 Subject: [PATCH 18/25] type fixes --- desci-server/src/controllers/search/query.ts | 5 ++--- .../src/services/ElasticSearchService.ts | 16 ++++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/desci-server/src/controllers/search/query.ts b/desci-server/src/controllers/search/query.ts index 2ad610208..a9cce97a4 100644 --- a/desci-server/src/controllers/search/query.ts +++ b/desci-server/src/controllers/search/query.ts @@ -78,9 +78,8 @@ export const singleQuery = async ( const esSort = buildSortQuery(entity, sortType, sortOrder); try { - // debugger; logger.debug({ esQuery, esSort }, 'Executing query'); - const { hits } = await elasticClient.search({ + const results = await elasticClient.search({ index: entity, body: { query: esQuery, @@ -89,8 +88,8 @@ export const singleQuery = async ( size: perPage, }, }); + const hits = results.hits; logger.info({ fn: 'Elastic search query executed successfully' }); - // return res.status(200).send({ esQuery, resp }); return res.json({ esQuery, diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts index d4e44f3d4..a6a8743dd 100644 --- a/desci-server/src/services/ElasticSearchService.ts +++ b/desci-server/src/services/ElasticSearchService.ts @@ -1,4 +1,8 @@ -import { QueryDslTextQueryType } from '@elastic/elasticsearch/lib/api/types.js'; +import { + QueryDslFunctionBoostMode, + QueryDslQueryContainer, + QueryDslTextQueryType, +} from '@elastic/elasticsearch/lib/api/types.js'; export const DENORMALIZED_WORKS_INDEX = 'denormalized_works_test_2024_08_01'; export const VALID_ENTITIES = [ @@ -82,8 +86,8 @@ export function scoreBoostFunction(query: Record<'multi_match', MultiMatchQuery> }, }, ], - boost_mode: 'sum', - score_mode: 'sum', + boost_mode: 'sum' as QueryDslFunctionBoostMode, + score_mode: 'sum' as QueryDslFunctionBoostMode, }, }; } @@ -127,8 +131,8 @@ export function buildMultiMatchQuery(query: string, entity: string, fuzzy?: numb }, }; - if (entity === 'works_single') return scoreBoostFunction(multiMatchQuery); - return multiMatchQuery; + if (entity === 'works_single') return scoreBoostFunction(multiMatchQuery) as QueryDslQueryContainer; + return multiMatchQuery as QueryDslQueryContainer; } export function buildSortQuery(entity: string, sortType?: string, sortOrder: SortOrder = 'desc'): SortField[] { @@ -165,6 +169,6 @@ export type IndexedAuthor = { export interface MultiMatchQuery { query: string; fields: any[]; - type: 'best_fields'; + type: QueryDslTextQueryType; fuzziness: string | number; } From e6642b6528c2ae35a12ccca5acb58fbfe3dcdd55 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:56:08 +0000 Subject: [PATCH 19/25] github actions failing to run test --- desci-server/src/services/ElasticSearchService.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/desci-server/src/services/ElasticSearchService.ts b/desci-server/src/services/ElasticSearchService.ts index a6a8743dd..7fc611f64 100644 --- a/desci-server/src/services/ElasticSearchService.ts +++ b/desci-server/src/services/ElasticSearchService.ts @@ -32,7 +32,6 @@ export const RELEVANT_FIELDS = { 'authors.last_known_institution', ], }; -// abstract_inverted_index type SortOrder = 'asc' | 'desc'; type SortField = { [field: string]: { order: SortOrder; missing?: string } }; From 4011dbd963fd8f723b810829b472622dff0fc58c Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:04:29 +0000 Subject: [PATCH 20/25] compose perms in gh actions edit --- .github/workflows/models-build-and-test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/models-build-and-test.yaml b/.github/workflows/models-build-and-test.yaml index 172b2156c..63d25cbb0 100644 --- a/.github/workflows/models-build-and-test.yaml +++ b/.github/workflows/models-build-and-test.yaml @@ -35,6 +35,7 @@ jobs: - name: Set up docker-compose run: | sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose sudo docker-compose --version docker info From 9fc621f81c3fc73a1a023098ef3df95789af5aac Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:15:20 +0000 Subject: [PATCH 21/25] compose perms --- .github/workflows/build-server.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-server.yaml b/.github/workflows/build-server.yaml index ebef63f23..a064e4854 100644 --- a/.github/workflows/build-server.yaml +++ b/.github/workflows/build-server.yaml @@ -52,6 +52,7 @@ jobs: - name: Set up docker-compose run: | sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose sudo docker-compose --version docker info From 356194f5219328cea47e02432675f1879d7c359b Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:27:25 +0000 Subject: [PATCH 22/25] fixed --- .github/workflows/build-and-test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-and-test.yaml b/.github/workflows/build-and-test.yaml index 956ee3061..ca44c8ba7 100644 --- a/.github/workflows/build-and-test.yaml +++ b/.github/workflows/build-and-test.yaml @@ -50,6 +50,7 @@ jobs: - name: Set up docker-compose run: | sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose sudo docker-compose --version docker info From ac7333021c601e7378296b20f66d73ca80c6e744 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 7 Aug 2024 11:52:54 +0000 Subject: [PATCH 23/25] fix undefined crash --- desci-server/src/elasticSearchClient.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/desci-server/src/elasticSearchClient.ts b/desci-server/src/elasticSearchClient.ts index 5d6c22b9c..0dbb65f04 100644 --- a/desci-server/src/elasticSearchClient.ts +++ b/desci-server/src/elasticSearchClient.ts @@ -8,7 +8,7 @@ if (!esNodeUrl || !esUser || !esPw) { console.error('Missing environment variables for ElasticSearch'); } -const esAuthConfig = !esNodeUrl.includes('host.docker.internal') +const esAuthConfig = !esNodeUrl?.includes('host.docker.internal') ? { // Auth unnecessary if running local ES node auth: { From 0e04fa653d8e9c148bbba6edf2c2a6859e2a1bc1 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:10:40 +0000 Subject: [PATCH 24/25] prevent crash if elastic envs not set --- desci-server/src/elasticSearchClient.ts | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/desci-server/src/elasticSearchClient.ts b/desci-server/src/elasticSearchClient.ts index 0dbb65f04..966c883a2 100644 --- a/desci-server/src/elasticSearchClient.ts +++ b/desci-server/src/elasticSearchClient.ts @@ -18,11 +18,13 @@ const esAuthConfig = !esNodeUrl?.includes('host.docker.internal') } : {}; -export const elasticClient = new Client({ - node: esNodeUrl, - - ...esAuthConfig, - tls: { - rejectUnauthorized: false, // Temporary - }, -}); +export const elasticClient = + esNodeUrl && esUser && esPw + ? new Client({ + node: esNodeUrl, + ...esAuthConfig, + tls: { + rejectUnauthorized: false, // Temporary + }, + }) + : ({} as any); From 59b50747e5b96052ed6bda72cbc0f12eac473da1 Mon Sep 17 00:00:00 2001 From: kadami <86646883+kadamidev@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:11:49 +0000 Subject: [PATCH 25/25] prevent crash if elastic search not configured --- desci-server/src/elasticSearchClient.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/desci-server/src/elasticSearchClient.ts b/desci-server/src/elasticSearchClient.ts index 966c883a2..aa2a6aafa 100644 --- a/desci-server/src/elasticSearchClient.ts +++ b/desci-server/src/elasticSearchClient.ts @@ -8,15 +8,16 @@ if (!esNodeUrl || !esUser || !esPw) { console.error('Missing environment variables for ElasticSearch'); } -const esAuthConfig = !esNodeUrl?.includes('host.docker.internal') - ? { - // Auth unnecessary if running local ES node - auth: { - username: esUser, - password: esPw, - }, - } - : {}; +const esAuthConfig = + !esNodeUrl?.includes('host.docker.internal') && esUser && esPw + ? { + // Auth unnecessary if running local ES node + auth: { + username: esUser, + password: esPw, + }, + } + : {}; export const elasticClient = esNodeUrl && esUser && esPw