Skip to content

Commit

Permalink
Merge pull request #626 from desci-labs/fixes
Browse files Browse the repository at this point in the history
Local dev fix and ES tinkering
  • Loading branch information
kadamidev authored Nov 11, 2024
2 parents b41cf9b + c4e7932 commit 7337761
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 17 deletions.
2 changes: 1 addition & 1 deletion desci-server/scripts/be-node-dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ apt-get install bash

# Exit on error
set -e
./desci-server/scripts/wait-for-it.sh $PG_HOST:5433 --timeout=5 --strict -- echo "postgres up and running"
./desci-server/scripts/wait-for-it.sh $PG_HOST:$PG_PORT --timeout=5 --strict -- echo "postgres up and running"

# npm run migration:run
# npm run seed:run
Expand Down
157 changes: 141 additions & 16 deletions desci-server/src/services/ElasticSearchService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,25 +160,31 @@ export function createFunctionScoreQuery(query: QueryDslQueryContainer, entity:
}

export function createAutocompleteFunctionScoreQuery(query: string): QueryDslQueryContainer {
// Use these as tie breakers, low weights/factors compared to the text matches
// Use these as tie breakers, small multipliers to slightly boost more relevant work
const boostFunctions: QueryDslFunctionScoreContainer[] = [
{
filter: {
term: { entity_type: 'work' },
},
weight: 0.9,
},
{
filter: { range: { cited_by_count: { gte: 1 } } },
field_value_factor: {
field: 'cited_by_count',
factor: 0.001,
modifier: 'log1p',
},
weight: 1,
weight: 1.1,
},
{
filter: { range: { works_count: { gte: 1 } } },
field_value_factor: {
field: 'works_count',
factor: 0.0005,
factor: 1,
modifier: 'log1p',
},
weight: 0.5,
weight: 1.05,
},
];

Expand Down Expand Up @@ -282,19 +288,13 @@ export function createAutocompleteFunctionScoreQuery(query: string): QueryDslQue
minimum_should_match: 1,
},
},
// Text matches (lower priority)
// Prefix matches
{
multi_match: {
query: query,
fields: [
'title^3',
'description^2',
'publisher^2',
'subfield_display_name^2',
'institution_data.display_name^2',
],
fields: ['title'],
type: 'phrase_prefix',
boost: 10,
boost: 200,
},
},
];
Expand All @@ -307,8 +307,132 @@ export function createAutocompleteFunctionScoreQuery(query: string): QueryDslQue
const functionScoreQuery: QueryDslFunctionScoreQuery = {
query: { bool: boolQuery },
functions: boostFunctions,
boost_mode: 'multiply' as QueryDslFunctionBoostMode,
score_mode: 'sum' as QueryDslFunctionScoreMode,
min_score: 0.1,
};

return { function_score: functionScoreQuery };
}

function createEnhancedWorksQueryV2(query: string): QueryDslQueryContainer {
const currentYear = new Date().getFullYear();

const cleanQuery = query.toLowerCase();

const shouldClauses: QueryDslQueryContainer[] = [
// Exact matches (highest priority)
{
bool: {
should: [
{
match_phrase: {
title: {
query: cleanQuery,
boost: 30,
analyzer: 'standard_analyzer',
},
},
},
{
term: {
doi: {
value: cleanQuery,
boost: 100,
},
},
},
],
minimum_should_match: 1,
},
},

// High-precision matches (70% threshold on title, 80% on abstract)
{
bool: {
should: [
{
match: {
title: {
query: cleanQuery,
minimum_should_match: '70%',
boost: 20,
analyzer: 'standard_analyzer',
},
},
},
{
match: {
abstract: {
query: cleanQuery,
minimum_should_match: '80%',
boost: 10,
analyzer: 'standard_analyzer',
},
},
},
],
minimum_should_match: 1,
},
},
];

const functionScoreQuery: QueryDslFunctionScoreQuery = {
query: { bool: { should: shouldClauses, minimum_should_match: 1 } },
functions: [
// Citation impact as tiebreaker
{
filter: { range: { cited_by_count: { gte: 1 } } },
field_value_factor: {
field: 'cited_by_count',
factor: 1,
modifier: 'log1p',
},
weight: 25,
},
// Publication year as tiebreaker
{
linear: {
publication_year: {
origin: currentYear.toString(),
scale: '25',
offset: '3',
decay: 0.5,
},
},
weight: 5,
},
// Boost for articles and preprints
{
filter: {
bool: {
should: [{ term: { type: 'article' } }, { term: { type: 'preprint' } }],
},
},
weight: 1,
},
// Venue quality as tiebreaker
{
filter: {
range: { 'best_locations.works_count': { gte: 1000 } },
},
field_value_factor: {
field: 'best_locations.works_count',
factor: 1,
modifier: 'log1p',
},
weight: 25,
},
// Language preference
{
filter: {
term: { language: 'en' },
},
weight: 2,
},
],
score_mode: 'sum' as QueryDslFunctionScoreMode,
boost_mode: 'sum' as QueryDslFunctionBoostMode,
score_mode: 'multiply' as QueryDslFunctionScoreMode,
min_score: 0.1,
};

Expand Down Expand Up @@ -366,7 +490,7 @@ function createEnhancedWorksQuery(query: string): QueryDslQueryContainer {
abstract: {
query: cleanQuery,
minimum_should_match: '80%',
boost: 60,
boost: 40,
analyzer: 'standard_analyzer',
},
},
Expand Down Expand Up @@ -651,7 +775,8 @@ export function buildMultiMatchQuery(
}

if (entity === 'works' || entity === 'works_single' || entity === 'works_opt') {
return createEnhancedWorksQuery(query);
return createEnhancedWorksQueryV2(query);
// return createEnhancedWorksQuery(query);
}

const fields = getRelevantFields(entity);
Expand Down

0 comments on commit 7337761

Please sign in to comment.