Skip to content

Commit

Permalink
Merge pull request #442 from desci-labs/es-search-api
Browse files Browse the repository at this point in the history
ES Search API
  • Loading branch information
hubsmoke authored Aug 8, 2024
2 parents 346d050 + 59b5074 commit f4483e6
Show file tree
Hide file tree
Showing 22 changed files with 875 additions and 5 deletions.
15 changes: 14 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -142,4 +142,17 @@ CROSSREF_NOTIFY_ENDPOINT=endpoint

# Automated metadata
AUTOMATED_METADATA_API=http://host.docker.internal:5005
AUTOMATED_METADATA_API_KEY=
AUTOMATED_METADATA_API_KEY=

# Elastic Search, required for /v1/search endpoints
ELASTIC_SEARCH_NODE_URL=
ELASTIC_SEARCH_USER=
ELASTIC_SEARCH_PW=

# Elastic search local dev node configuration
ES_NODE=http://host.docker.internal:9200
ES_DB_HOST=
ES_DB_PORT=
ES_DB_NAME=
ES_DB_USER=
ES_DB_PASSWORD=
1 change: 1 addition & 0 deletions .github/workflows/build-and-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ jobs:
- name: Set up docker-compose
run: |
sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
sudo docker-compose --version
docker info
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build-server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ jobs:
- name: Set up docker-compose
run: |
sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
sudo docker-compose --version
docker info
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/models-build-and-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jobs:
- name: Set up docker-compose
run: |
sudo curl -L "https://github.com/docker/compose/releases/download/v2.18.1/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
sudo docker-compose --version
docker info
Expand Down
15 changes: 15 additions & 0 deletions desci-elastic/Dockerfile-logstash
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
FROM docker.elastic.co/logstash/logstash:8.14.3

USER root

# Install curl
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*

# Copy the initialization script
COPY desci-elastic/init-logstash.sh /usr/local/bin/init-logstash.sh
RUN chmod +x /usr/local/bin/init-logstash.sh

USER logstash

# Set the entrypoint to the initialization script
ENTRYPOINT ["/usr/local/bin/init-logstash.sh"]
38 changes: 38 additions & 0 deletions desci-elastic/init-logstash.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash

set -e

# Configuration
DRIVER_URL="https://jdbc.postgresql.org/download/postgresql-42.7.3.jar"
DRIVER_DIR="/opt/logstash/drivers"
DRIVER_FILE="$DRIVER_DIR/postgresql-42.7.3.jar"

# Ensure the driver directory exists
mkdir -p "$DRIVER_DIR"


download_driver() {
echo "Downloading PostgreSQL JDBC driver..."
curl -# -o "$DRIVER_FILE" "$DRIVER_URL"
chmod 644 "$DRIVER_FILE"
echo "Driver downloaded and permissions set."
}

# Check if driver exists and download if necessary
if [ -f "$DRIVER_FILE" ]; then
echo "PostgreSQL JDBC driver already exists."
else
download_driver
fi

# Verify the driver file
if [ ! -f "$DRIVER_FILE" ]; then
echo "Error: Failed to download or locate the PostgreSQL JDBC driver."
exit 1
fi

# Ensure correct permissions on the driver file
chmod 644 "$DRIVER_FILE"

# Start Logstash with the provided pipeline configuration
exec logstash -f /usr/share/logstash/pipeline/logstash.conf
62 changes: 62 additions & 0 deletions desci-elastic/logstash-authors.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
input {
jdbc {
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar"
jdbc_driver_class => "org.postgresql.Driver"
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}"
jdbc_user => "${ES_DB_USER}"
jdbc_password => "${ES_DB_PASSWORD}"
statement => "
SELECT
id::text,
orcid::text,
display_name::text,
display_name_alternatives::text,
works_count::text,
cited_by_count::text,
last_known_institution::text,
works_api_url::text,
updated_date::text
FROM openalex.authors
WHERE updated_date > CAST(:sql_last_value AS TIMESTAMP)
ORDER BY updated_date ASC, id ASC
LIMIT 1000
"
use_column_value => true
tracking_column => "updated_date"
tracking_column_type => "timestamp"
last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run"
jdbc_paging_enabled => true
jdbc_page_size => 1000
}
}

filter {
mutate {
remove_field => ["@version", "@timestamp"]
}
json {
source => "display_name_alternatives"
target => "display_name_alternatives"
skip_on_invalid_json => true
}
json {
source => "last_known_institution"
target => "last_known_institution"
skip_on_invalid_json => true
}
mutate {
convert => {
"works_count" => "integer"
"cited_by_count" => "integer"
}
}
}

output {
elasticsearch {
hosts => ["${ES_NODE}"]
index => "authors"
document_id => "%{id}"
doc_as_upsert => true
}
}
37 changes: 37 additions & 0 deletions desci-elastic/logstash-works-authorships.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
input {
jdbc {
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar"
jdbc_driver_class => "org.postgresql.Driver"
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}"
jdbc_user => "${ES_DB_USER}"
jdbc_password => "${ES_DB_PASSWORD}"
statement => "
SELECT
work_id,
author_id,
author_position,
raw_affiliation_string,
institution_id
FROM openalex.works_authorships
ORDER BY work_id ASC
LIMIT 10000
"
jdbc_paging_enabled => true
jdbc_page_size => 1000
}
}

filter {
mutate {
remove_field => ["@version", "@timestamp"]
}
}

output {
elasticsearch {
hosts => ["${ES_NODE}"]
index => "works_authorships"
document_id => "%{work_id}-%{author_id}"
doc_as_upsert => true
}
}
64 changes: 64 additions & 0 deletions desci-elastic/logstash-works.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
input {
jdbc {
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar"
jdbc_driver_class => "org.postgresql.Driver"
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}"
jdbc_user => "${ES_DB_USER}"
jdbc_password => "${ES_DB_PASSWORD}"
statement => "
SELECT
id::TEXT,
orcid::TEXT,
display_name::TEXT,
display_name_alternatives::TEXT,
works_count::TEXT,
cited_by_count::TEXT,
last_known_institution::TEXT,
works_api_url::TEXT,
updated_date::TEXT
FROM openalex.authors
WHERE updated_date > CAST(:sql_last_value AS TIMESTAMP)
ORDER BY updated_date ASC, id ASC
LIMIT 1000
"
use_column_value => true
tracking_column => "updated_date"
tracking_column_type => "timestamp"
last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run"
jdbc_paging_enabled => true
jdbc_page_size => 1000
codec => json
}
}

filter {
mutate {
remove_field => ["@version", "@timestamp"]
}
json {
source => "display_name_alternatives"
target => "display_name_alternatives"
skip_on_invalid_json => true
}
json {
source => "last_known_institution"
target => "last_known_institution"
skip_on_invalid_json => true
}
mutate {
convert => {
"works_count" => "integer"
"cited_by_count" => "integer"
}
}
}

output {
stdout { codec => json }
elasticsearch {
hosts => ["${ES_NODE}"]
index => "authors"
document_id => "%{id}"
doc_as_upsert => true
}
}
98 changes: 98 additions & 0 deletions desci-elastic/logstash.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Imports a denormalized works table including the authors table and the works_authorships join table
input {
jdbc {
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar"
jdbc_driver_class => "org.postgresql.Driver"
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}"
jdbc_user => "${ES_DB_USER}"
jdbc_password => "${ES_DB_PASSWORD}"
jdbc_paging_enabled => true
jdbc_page_size => 100
use_column_value => true
tracking_column => "publication_date"
tracking_column_type => "timestamp"
last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run"
statement => "
SELECT
w.id::TEXT AS work_id,
w.doi::TEXT,
w.title::TEXT,
w.publication_year::TEXT,
w.type::TEXT,
w.cited_by_count::TEXT AS cited_by_count,
w.abstract_inverted_index::TEXT as abstract_inverted_index,
w.publication_date::TIMESTAMP AS publication_date
FROM openalex.works w
WHERE w.publication_date::TIMESTAMP > CAST(:sql_last_value AS TIMESTAMP)
ORDER BY w.publication_date::TIMESTAMP ASC, w.id ASC
LIMIT 100
"
codec => json
}
}

filter {
mutate {
remove_field => ["@version", "@timestamp"]
}
json {
source => "abstract_inverted_index"
target => "abstract_inverted_index_parsed"
}
ruby {
code => '
abstract_inverted_index = event.get("abstract_inverted_index_parsed")
if abstract_inverted_index
abstract_length = abstract_inverted_index.values.flatten.max + 1
abstract_words = Array.new(abstract_length, "")
abstract_inverted_index.each do |word, positions|
positions.each do |position|
abstract_words[position] = word
end
end
abstract = abstract_words.join(" ")
event.set("abstract", abstract)
end
'
}
mutate {
remove_field => ["abstract_inverted_index", "abstract_inverted_index_parsed"]
convert => {
"cited_by_count" => "integer"
"publication_year" => "integer"
}
}
jdbc_streaming {
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar"
jdbc_driver_class => "org.postgresql.Driver"
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}"
jdbc_user => "${ES_DB_USER}"
jdbc_password => "${ES_DB_PASSWORD}"
statement => "
SELECT
a.id AS author_id,
wa.author_position,
a.display_name AS author_name,
a.works_count AS author_works_count,
a.cited_by_count AS author_cited_by_count,
wa.institution_id,
a.orcid
FROM openalex.works_authorships wa
JOIN openalex.authors a ON wa.author_id = a.id
WHERE wa.work_id = :work_id
ORDER BY wa.author_position ASC
"
parameters => { "work_id" => "work_id" }
target => "authors"
}
}

output {
stdout { codec => json }
elasticsearch {
hosts => ["${ES_NODE}"]
index => "denormalized_works_test2"
document_id => "%{[work_id]}"
doc_as_upsert => true
}
}
3 changes: 3 additions & 0 deletions desci-server/kubernetes/deployment_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ spec:
export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }}
export AUTOMATED_METADATA_API="{{ .Data.AUTOMATED_METADATA_API }}"
export AUTOMATED_METADATA_API_KEY="{{ .Data.AUTOMATED_METADATA_API_KEY }}"
export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}"
export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}"
export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}"
export DEBUG_TEST=0;
echo "appfinish";
{{- end -}}
Expand Down
3 changes: 3 additions & 0 deletions desci-server/kubernetes/deployment_prod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ spec:
export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }}
export AUTOMATED_METADATA_API="{{ .Data.AUTOMATED_METADATA_API }}"
export AUTOMATED_METADATA_API_KEY="{{ .Data.AUTOMATED_METADATA_API_KEY }}"
export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}"
export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}"
export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}"
export IGNORE_LINE=0;
export DEBUG_TEST=0;
echo "appfinish";
Expand Down
3 changes: 3 additions & 0 deletions desci-server/kubernetes/deployment_staging.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ spec:
export CROSSREF_NOTIFY_ENDPOINT={{ .Data.CROSSREF_NOTIFY_ENDPOINT }}
export AUTOMATED_METADATA_API={{ .Data.AUTOMATED_METADATA_API }}
export AUTOMATED_METADATA_API_KEY={{ .Data.AUTOMATED_METADATA_API_KEY }}
export ELASTIC_SEARCH_NODE_URL="{{ .Data.ELASTIC_SEARCH_NODE_URL }}"
export ELASTIC_SEARCH_USER="{{ .Data.ELASTIC_SEARCH_USER }}"
export ELASTIC_SEARCH_PW="{{ .Data.ELASTIC_SEARCH_PW }}"
export DEBUG_TEST=0;
echo "appfinish";
{{- end -}}
Expand Down
Loading

0 comments on commit f4483e6

Please sign in to comment.