
Merge branch 'master' into grobid-0.8.1
# Conflicts:
#	build.gradle
lfoppiano committed Jan 4, 2025
2 parents 8b73624 + 2a849b9 commit ee4ff92
Showing 15 changed files with 267 additions and 70 deletions.
15 changes: 11 additions & 4 deletions Dockerfile
@@ -5,7 +5,8 @@ ARG BUILD_VERSION=0.0.6
# -------------
# builder image
# -------------
-FROM openjdk:8u275-jdk as builder
+#FROM openjdk:8u275-jdk as builder
+FROM openjdk:17-jdk-slim as builder

USER root
ENV LANG="en_US.UTF-8" \
@@ -21,8 +22,8 @@ RUN apt-get update -y && \
WORKDIR /opt/

# install GROBID
-RUN wget --tries=10 --read-timeout=10 https://github.com/kermitt2/grobid/archive/refs/tags/0.7.2.zip
-RUN unzip -o 0.7.2.zip && mv grobid-* grobid
+RUN wget --tries=10 --read-timeout=10 https://github.com/kermitt2/grobid/archive/refs/tags/0.8.1.zip
+RUN unzip -o 0.8.1.zip && mv grobid-* grobid

WORKDIR /opt/grobid

@@ -34,7 +35,7 @@ RUN rm -rf grobid-home/pdfalto/win-*
RUN rm -rf grobid-home/lib/lin-32
RUN rm -rf grobid-home/lib/win-*
RUN rm -rf grobid-home/lib/mac-64
-RUN rm -rf ../0.7.2.zip
+RUN rm -rf ../0.8.1.zip

# cleaning DeLFT models
RUN rm -rf grobid-home/models/*-BidLSTM_CRF*
@@ -77,6 +78,12 @@ RUN apt-get update && \
RUN apt-get update -y && \
apt-get clean all -y

+# Add Tini
+ENV TINI_VERSION v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "-s", "--"]

WORKDIR /opt/grobid
COPY --from=builder /opt/grobid .

3 changes: 2 additions & 1 deletion build.gradle
@@ -5,6 +5,7 @@ buildscript {
maven {
url 'https://plugins.gradle.org/m2/'
}
+maven { url "https://grobid.s3.eu-west-1.amazonaws.com/repo/" }
}
dependencies {
classpath 'gradle.plugin.org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.12.0'
@@ -59,7 +60,7 @@ dependencies {
//exclude(module: 'log4j-over-slf4j')
exclude(group: 'ch.qos.logback', module: 'logback-classic')
}
-implementation (group: 'org.grobid', name: 'grobid-ner', version: '0.8.0') {
+implementation (group: 'org.grobid', name: 'grobid-ner', version: '0.8.1') {
//exclude(module: 'log4j-over-slf4j')
exclude(group: 'ch.qos.logback', module: 'logback-classic')
}
8 changes: 7 additions & 1 deletion doc/build.rst
@@ -137,4 +137,10 @@ Prometheus metrics (e.g. for Graphana monitoring) are available at http://localh
Creating a new Knowledge Base version
*************************************

-The knowledge base used by *entity-fishing* can be updated with new versions of Wikidata and Wikipedia using the pre-processing from the library `GRISP <https://github.com/kermitt2/grisp>`_, see `https://github.com/kermitt2/grisp <https://github.com/kermitt2/grisp>`_.
+The knowledge base used by *entity-fishing* can be updated with new versions of Wikidata and Wikipedia using the pre-processing from the library `GRISP <https://github.com/kermitt2/grisp>`_.

+The files generated by GRISP (see `listing all necessary files <https://github.com/kermitt2/grisp?tab=readme-ov-file#final-hierarchy-of-files>`_) should be referenced via the configuration:

+- ``dataDirectory`` in the files ``wikipedia-XY.yml`` (with XY equal to the language, e.g. ``en``, ``fr``) for the Wikipedia-related knowledge base. Note: the ``XYwiki-latest-pages-articles-multistream.xml.bz2`` file can be left compressed.

+- ``dataDirectory`` in the file ``kb.yml`` for the Wikidata knowledge base (db-kb).
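For illustration, a minimal sketch of what these two settings could look like once the GRISP output is in place (the directory paths below are hypothetical; only the ``dataDirectory`` key comes from the documentation above)::

    # wikipedia-en.yml (Wikipedia knowledge base for English)
    dataDirectory: /data/grisp/en

    # kb.yml (Wikidata knowledge base, db-kb)
    dataDirectory: /data/grisp/wikidata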
82 changes: 82 additions & 0 deletions doc/restAPI.rst
@@ -1295,3 +1295,85 @@ Or in case of issues:
"ok": "false",
"message": "The customisation already exists."
}

+Data and statistics API
+***********************

+GET /data
+^^^^^^^^^

+Retrieves information about the loaded data, showing the number of concepts per knowledge base.

+(1) Example response

+Here is a sample of the response
+::
+    {
+        "upperKnowledgeBaseStatisticsCount": {
+            "Concepts": 113276007,
+            "Labels": 113331134,
+            "Statements": 112505569
+        },
+        "lowerKnowledgeBaseStatisticsCount": {
+            "de": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "hi": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "ru": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "sv": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "pt": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "en": {
+                "Pages": 20279663,
+                "Articles": 6649343
+            },
+            "it": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "fr": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "bn": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "es": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "zh": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "ar": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "uk": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "ja": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "fa": {
+                "Pages": 0,
+                "Articles": 0
+            }
+        }
+    }
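Assuming a local instance with the project's usual defaults (host ``localhost``, port ``8090``, service root ``/service``; adjust these if your deployment differs), the endpoint can be exercised with a plain GET request::

    curl http://localhost:8090/service/data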
86 changes: 53 additions & 33 deletions src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java
@@ -16,41 +16,61 @@
import java.util.List;

/**
- * Represent the language specific resources of the Knowledge Base, e.g. a
- * Wikipedia instance, including corresponding word and entity embeddings.
- *
+ * Represent the language specific resources of the Knowledge Base, e.g. a
+ * Wikipedia instance, including corresponding word and entity embeddings.
+ *
*/
public class LowerKnowledgeBase {

protected static final Logger LOGGER = LoggerFactory.getLogger(LowerKnowledgeBase.class);

private KBLowerEnvironment env = null;
private int wikipediaArticleCount = -1;
+private int wikipediaPageCount = -1;

public enum Direction {
-In,
+In,
Out
}

/**
- * Initialises a newly created Wikipedia according to the given configuration.
- *
+ * Initialises a newly created Wikipedia according to the given configuration.
+ *
*/
public LowerKnowledgeBase(NerdConfig conf) {
this.env = new KBLowerEnvironment(conf);
try {
this.env.buildEnvironment(conf, false);
} catch(Exception e) {
LOGGER.error("Environment for Wikipedia cannot be built", e);
}
-}
+}

public int getArticleCount() {
-if (wikipediaArticleCount == -1)
-wikipediaArticleCount = this.env.retrieveStatistic(StatisticName.articleCount).intValue();
+if (wikipediaArticleCount == -1) {
+Long articleCountStats = this.env.retrieveStatistic(StatisticName.articleCount);
+if (articleCountStats != null) {
+wikipediaArticleCount = articleCountStats.intValue();
+} else {
+return 0;
+}
+}

return wikipediaArticleCount;
}

+public int getPageCount() {
+if (wikipediaPageCount == -1) {
+Long pageCount = this.env.getDbPage().getDatabaseSize();
+if (pageCount != null) {
+wikipediaPageCount = pageCount.intValue();
+} else {
+return 0;
+}
+}
+return wikipediaPageCount;
+}

/**
* Returns the environment that this is connected to
*/
@@ -60,17 +80,17 @@ public KBLowerEnvironment getEnvironment() {

/**
* Make ready the full content database of articles
- *
+ *
*/
public void loadFullContentDB() {
try {
if (this.env != null)
this.env.buildFullMarkup(false);
-else
+else
LOGGER.error("Environment for Wikipedia full content article DB is null");
} catch(Exception e) {
LOGGER.error("Environment for Wikipedia full content cannot be built", e);
}
-}
+}

/**
@@ -82,7 +102,7 @@ public NerdConfig getConfig() {

/**
* Returns the root Category from which all other categories can be browsed.
- *
+ *
*/
public com.scienceminer.nerd.kb.model.Category getRootCategory() {
return new com.scienceminer.nerd.kb.model.Category(env, env.retrieveStatistic(StatisticName.rootCategoryId).intValue());
@@ -96,11 +116,11 @@ public Page getPageById(int id) {
}

/**
- * Returns the Page referenced by the given Wikidata id for the language of the Wikipedia.
- * The page can be cast into the appropriate type for more specific functionality.
- *
+ * Returns the Page referenced by the given Wikidata id for the language of the Wikipedia.
+ * The page can be cast into the appropriate type for more specific functionality.
+ *
* @param id the Wikidata id of the Page to retrieve.
- * @return the Page referenced by the given id, or null if one does not exist.
+ * @return the Page referenced by the given id, or null if one does not exist.
*/
/*public Page getPageByWikidataId(String wikidataId) {
return Page.createPage(env, wikidataId);
@@ -121,7 +141,7 @@ public Page getPageById(int id) {
/**
* Returns the Article referenced by the given (case sensitive) title. If the title
* matches a redirect, this will be resolved to return the final target.
- *
+ *
*/
public Article getArticleByTitle(String title) {
if (title == null || title.length() == 0)
@@ -144,7 +164,7 @@ public Article getArticleByTitle(String title) {
}

/**
- * Returns the Category referenced by the given (case sensitive) title.
+ * Returns the Category referenced by the given (case sensitive) title.
*
*/
public com.scienceminer.nerd.kb.model.Category getCategoryByTitle(String title) {
@@ -162,8 +182,8 @@ public com.scienceminer.nerd.kb.model.Category getCategoryByTitle(String title)
}

/**
- * Returns the Template referenced by the given (case sensitive) title.
- *
+ * Returns the Template referenced by the given (case sensitive) title.
+ *
*/
public Template getTemplateByTitle(String title) {
title = title.substring(0,1).toUpperCase() + title.substring(1);
@@ -181,20 +201,20 @@ public Template getTemplateByTitle(String title) {


/**
- * Returns the most probable article for a given term.
+ * Returns the most probable article for a given term.
*/
public Article getMostProbableArticle(String term) {
Label label = new Label(env, term);
-if (!label.exists())
+if (!label.exists())
return null;

return label.getSenses()[0];
}

/**
* A convenience method for quickly finding out if the given text is ever used as a label
- * in Wikipedia. If this returns false, then all of the getArticle methods will return null or empty sets.
- *
+ * in Wikipedia. If this returns false, then all of the getArticle methods will return null or empty sets.
+ *
*/
/*public boolean isLabel(String text) {
DbLabel lbl = env.getDbLabel().retrieve(text);
@@ -207,31 +227,31 @@ public Label getLabel(String text) {

/**
* Returns an iterator for all pages in the database, in order of ascending ids.
- *
+ *
*/
public PageIterator getPageIterator() {
return new PageIterator(env);
}

/**
* Returns an iterator for all pages in the database of the given type, in order of ascending ids.
- *
+ *
*/
public PageIterator getPageIterator(PageType type) {
-return new PageIterator(env, type);
+return new PageIterator(env, type);
}

/**
* Returns an iterator for all labels in the database, processed according to the given text processor (may be null), in alphabetical order.
- *
+ *
*/
public LabelIterator getLabelIterator() {
return new LabelIterator(env);
}

/**
* Returns the list of links in relation to artId with the specified direction (in or out).
- *
+ *
*/
public List<Integer> getLinks(int artId, Direction dir) {
DbIntList ids = null;
@@ -240,7 +260,7 @@ public List<Integer> getLinks(int artId, Direction dir) {
else
ids = env.getDbPageLinkOutNoSentences().retrieve(artId);

-if (ids == null || ids.getValues() == null)
+if (ids == null || ids.getValues() == null)
return new ArrayList<Integer>();

return ids.getValues();
@@ -286,7 +306,7 @@ public double getWordFrequency(String word) {
return 0.0;
else
return Utilities.cBToFrequency(cB.intValue());
-}
+}

/**
* @return frequency of word for the language
@@ -297,7 +317,7 @@ public double getWordZipf(String word) {
return 0.0;
else
return Utilities.cBToZipf(cB.intValue());
-}
+}

public void close() {
env.close();
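Both new getters, getArticleCount and getPageCount, replace a bare .intValue() dereference with the same defensive pattern: look the statistic up once, cache it when present, and return 0 without caching when the store yields null. A self-contained sketch of that pattern under hypothetical names (the real code reads from KBLowerEnvironment as shown above):

import java.util.function.Supplier;

public class CachedStat {
    private int cached = -1; // -1 acts as the "not yet loaded" sentinel, as in the diff
    private final Supplier<Long> source;

    public CachedStat(Supplier<Long> source) {
        this.source = source;
    }

    public int get() {
        if (cached == -1) {
            Long value = source.get(); // may be null when the statistic is absent
            if (value != null) {
                cached = value.intValue(); // cache for subsequent calls
            } else {
                return 0; // do not cache: the store may be populated later
            }
        }
        return cached;
    }

    public static void main(String[] args) {
        CachedStat missing = new CachedStat(() -> null); // simulates an absent statistic
        System.out.println(missing.get()); // prints 0 instead of throwing a NullPointerException

        CachedStat present = new CachedStat(() -> 42L);
        System.out.println(present.get()); // prints 42, cached after the first lookup
    }
}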
