
Merge branch 'master' into grobid-0.8.1
# Conflicts:
#	build.gradle
lfoppiano committed Jan 4, 2025
2 parents 8b73624 + 2a849b9 commit ee4ff92
Showing 15 changed files with 267 additions and 70 deletions.
15 changes: 11 additions & 4 deletions Dockerfile
@@ -5,7 +5,8 @@ ARG BUILD_VERSION=0.0.6
# -------------
# builder image
# -------------
-FROM openjdk:8u275-jdk as builder
+#FROM openjdk:8u275-jdk as builder
+FROM openjdk:17-jdk-slim as builder

USER root
ENV LANG="en_US.UTF-8" \
@@ -21,8 +22,8 @@ RUN apt-get update -y && \
WORKDIR /opt/

# install GROBID
-RUN wget --tries=10 --read-timeout=10 https://github.com/kermitt2/grobid/archive/refs/tags/0.7.2.zip
-RUN unzip -o 0.7.2.zip && mv grobid-* grobid
+RUN wget --tries=10 --read-timeout=10 https://github.com/kermitt2/grobid/archive/refs/tags/0.8.1.zip
+RUN unzip -o 0.8.1.zip && mv grobid-* grobid

WORKDIR /opt/grobid

@@ -34,7 +35,7 @@ RUN rm -rf grobid-home/pdfalto/win-*
RUN rm -rf grobid-home/lib/lin-32
RUN rm -rf grobid-home/lib/win-*
RUN rm -rf grobid-home/lib/mac-64
-RUN rm -rf ../0.7.2.zip
+RUN rm -rf ../0.8.1.zip

# cleaning DeLFT models
RUN rm -rf grobid-home/models/*-BidLSTM_CRF*
@@ -77,6 +78,12 @@ RUN apt-get update && \
RUN apt-get update -y && \
apt-get clean all -y

+# Add Tini
+ENV TINI_VERSION v0.19.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "-s", "--"]

WORKDIR /opt/grobid
COPY --from=builder /opt/grobid .

3 changes: 2 additions & 1 deletion build.gradle
@@ -5,6 +5,7 @@ buildscript {
maven {
url 'https://plugins.gradle.org/m2/'
}
+maven { url "https://grobid.s3.eu-west-1.amazonaws.com/repo/" }
}
dependencies {
classpath 'gradle.plugin.org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.12.0'
@@ -59,7 +60,7 @@ dependencies {
//exclude(module: 'log4j-over-slf4j')
exclude(group: 'ch.qos.logback', module: 'logback-classic')
}
-implementation (group: 'org.grobid', name: 'grobid-ner', version: '0.8.0') {
+implementation (group: 'org.grobid', name: 'grobid-ner', version: '0.8.1') {
//exclude(module: 'log4j-over-slf4j')
exclude(group: 'ch.qos.logback', module: 'logback-classic')
}
8 changes: 7 additions & 1 deletion doc/build.rst
@@ -137,4 +137,10 @@ Prometheus metrics (e.g. for Graphana monitoring) are available at http://localh
Creating a new Knowledge Base version
*************************************

-The knowledge base used by *entity-fishing* can be updated with new versions of Wikidata and Wikipedia using the pre-processing from the library `GRISP <https://github.com/kermitt2/grisp>`_, see `https://github.com/kermitt2/grisp <https://github.com/kermitt2/grisp>`_.
+The knowledge base used by *entity-fishing* can be updated with new versions of Wikidata and Wikipedia using the pre-processing from the library `GRISP <https://github.com/kermitt2/grisp>`_.

+The files generated by GRISP (see `listing all necessary files <https://github.com/kermitt2/grisp?tab=readme-ov-file#final-hierarchy-of-files>`_) should be referenced via the configuration:

+- ``dataDirectory`` in the files ``wikipedia-XY.yml`` (with XY equal to the language, e.g. ``en``, ``fr``) for the Wikipedia-related knowledge base. Note: the ``XYwiki-latest-pages-articles-multistream.xml.bz2`` file can be left compressed.

+- ``dataDirectory`` in the file ``kb.yml`` for the Wikidata knowledge base (db-kb).
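For illustration, a minimal sketch of what these two settings could look like once the GRISP output is in place (the directory paths below are hypothetical; only the ``dataDirectory`` key comes from the documentation above)::

    # wikipedia-en.yml (Wikipedia knowledge base for English)
    dataDirectory: /data/grisp/en

    # kb.yml (Wikidata knowledge base, db-kb)
    dataDirectory: /data/grisp/wikidata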
82 changes: 82 additions & 0 deletions doc/restAPI.rst
@@ -1295,3 +1295,85 @@ Or in case of issues:
"ok": "false",
"message": "The customisation already exists."
}

+Data and statistics API
+***********************

+GET /data
+^^^^^^^^^

+Retrieves information about the loaded data, showing the number of concepts per knowledge base.

+(1) Example response

+Here is a sample of the response
+::
+    {
+        "upperKnowledgeBaseStatisticsCount": {
+            "Concepts": 113276007,
+            "Labels": 113331134,
+            "Statements": 112505569
+        },
+        "lowerKnowledgeBaseStatisticsCount": {
+            "de": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "hi": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "ru": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "sv": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "pt": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "en": {
+                "Pages": 20279663,
+                "Articles": 6649343
+            },
+            "it": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "fr": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "bn": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "es": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "zh": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "ar": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "uk": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "ja": {
+                "Pages": 0,
+                "Articles": 0
+            },
+            "fa": {
+                "Pages": 0,
+                "Articles": 0
+            }
+        }
+    }
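Assuming a local instance with the project's usual defaults (host ``localhost``, port ``8090``, service root ``/service``; adjust these if your deployment differs), the endpoint can be exercised with a plain GET request::

    curl http://localhost:8090/service/data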
86 changes: 53 additions & 33 deletions src/main/java/com/scienceminer/nerd/kb/LowerKnowledgeBase.java
@@ -16,41 +16,61 @@
import java.util.List;

/**
- * Represent the language specific resources of the Knowledge Base, e.g. a
- * Wikipedia instance, including corresponding word and entity embeddings.
- *
+ * Represent the language specific resources of the Knowledge Base, e.g. a
+ * Wikipedia instance, including corresponding word and entity embeddings.
+ *
*/
public class LowerKnowledgeBase {

protected static final Logger LOGGER = LoggerFactory.getLogger(LowerKnowledgeBase.class);

private KBLowerEnvironment env = null;
private int wikipediaArticleCount = -1;
+private int wikipediaPageCount = -1;

public enum Direction {
-In,
+In,
Out
}

/**
- * Initialises a newly created Wikipedia according to the given configuration.
- *
+ * Initialises a newly created Wikipedia according to the given configuration.
+ *
*/
public LowerKnowledgeBase(NerdConfig conf) {
this.env = new KBLowerEnvironment(conf);
try {
this.env.buildEnvironment(conf, false);
} catch(Exception e) {
LOGGER.error("Environment for Wikipedia cannot be built", e);
}
-}
+}

public int getArticleCount() {
-if (wikipediaArticleCount == -1)
-wikipediaArticleCount = this.env.retrieveStatistic(StatisticName.articleCount).intValue();
+if (wikipediaArticleCount == -1) {
+Long articleCountStats = this.env.retrieveStatistic(StatisticName.articleCount);
+if (articleCountStats != null) {
+wikipediaArticleCount = articleCountStats.intValue();
+} else {
+return 0;
+}
+}

return wikipediaArticleCount;
}

+public int getPageCount() {
+if (wikipediaPageCount == -1) {
+Long pageCount = this.env.getDbPage().getDatabaseSize();
+if (pageCount != null) {
+wikipediaPageCount = pageCount.intValue();
+} else {
+return 0;
+}
+}
+return wikipediaPageCount;
+}

/**
* Returns the environment that this is connected to
*/
@@ -60,17 +80,17 @@ public KBLowerEnvironment getEnvironment() {

/**
* Make ready the full content database of articles
- *
+ *
*/
public void loadFullContentDB() {
try {
if (this.env != null)
this.env.buildFullMarkup(false);
-else
+else
LOGGER.error("Environment for Wikipedia full content article DB is null");
} catch(Exception e) {
LOGGER.error("Environment for Wikipedia full content cannot be built", e);
}
-}
+}

/**
@@ -82,7 +102,7 @@ public NerdConfig getConfig() {

/**
* Returns the root Category from which all other categories can be browsed.
- *
+ *
*/
public com.scienceminer.nerd.kb.model.Category getRootCategory() {
return new com.scienceminer.nerd.kb.model.Category(env, env.retrieveStatistic(StatisticName.rootCategoryId).intValue());
@@ -96,11 +116,11 @@ public Page getPageById(int id) {
}

/**
- * Returns the Page referenced by the given Wikidata id for the language of the Wikipedia.
- * The page can be cast into the appropriate type for more specific functionality.
- *
+ * Returns the Page referenced by the given Wikidata id for the language of the Wikipedia.
+ * The page can be cast into the appropriate type for more specific functionality.
+ *
* @param id the Wikidata id of the Page to retrieve.
- * @return the Page referenced by the given id, or null if one does not exist.
+ * @return the Page referenced by the given id, or null if one does not exist.
*/
/*public Page getPageByWikidataId(String wikidataId) {
return Page.createPage(env, wikidataId);
@@ -121,7 +141,7 @@ public Page getPageById(int id) {
/**
* Returns the Article referenced by the given (case sensitive) title. If the title
* matches a redirect, this will be resolved to return the final target.
- *
+ *
*/
public Article getArticleByTitle(String title) {
if (title == null || title.length() == 0)
@@ -144,7 +164,7 @@ public Article getArticleByTitle(String title) {
}

/**
- * Returns the Category referenced by the given (case sensitive) title.
+ * Returns the Category referenced by the given (case sensitive) title.
*
*/
public com.scienceminer.nerd.kb.model.Category getCategoryByTitle(String title) {
@@ -162,8 +182,8 @@ public com.scienceminer.nerd.kb.model.Category getCategoryByTitle(String title)
}

/**
- * Returns the Template referenced by the given (case sensitive) title.
- *
+ * Returns the Template referenced by the given (case sensitive) title.
+ *
*/
public Template getTemplateByTitle(String title) {
title = title.substring(0,1).toUpperCase() + title.substring(1);
@@ -181,20 +201,20 @@ public Template getTemplateByTitle(String title) {


/**
- * Returns the most probable article for a given term.
+ * Returns the most probable article for a given term.
*/
public Article getMostProbableArticle(String term) {
Label label = new Label(env, term);
-if (!label.exists())
+if (!label.exists())
return null;

return label.getSenses()[0];
}

/**
* A convenience method for quickly finding out if the given text is ever used as a label
- * in Wikipedia. If this returns false, then all of the getArticle methods will return null or empty sets.
- *
+ * in Wikipedia. If this returns false, then all of the getArticle methods will return null or empty sets.
+ *
*/
/*public boolean isLabel(String text) {
DbLabel lbl = env.getDbLabel().retrieve(text);
@@ -207,31 +227,31 @@ public Label getLabel(String text) {

/**
* Returns an iterator for all pages in the database, in order of ascending ids.
- *
+ *
*/
public PageIterator getPageIterator() {
return new PageIterator(env);
}

/**
* Returns an iterator for all pages in the database of the given type, in order of ascending ids.
- *
+ *
*/
public PageIterator getPageIterator(PageType type) {
-return new PageIterator(env, type);
+return new PageIterator(env, type);
}

/**
* Returns an iterator for all labels in the database, processed according to the given text processor (may be null), in alphabetical order.
- *
+ *
*/
public LabelIterator getLabelIterator() {
return new LabelIterator(env);
}

/**
* Returns the list of links in relation to artId with the specified direction (in or out).
- *
+ *
*/
public List<Integer> getLinks(int artId, Direction dir) {
DbIntList ids = null;
@@ -240,7 +260,7 @@ public List<Integer> getLinks(int artId, Direction dir) {
else
ids = env.getDbPageLinkOutNoSentences().retrieve(artId);

-if (ids == null || ids.getValues() == null)
+if (ids == null || ids.getValues() == null)
return new ArrayList<Integer>();

return ids.getValues();
@@ -286,7 +306,7 @@ public double getWordFrequency(String word) {
return 0.0;
else
return Utilities.cBToFrequency(cB.intValue());
-}
+}

/**
* @return frequency of word for the language
@@ -297,7 +317,7 @@ public double getWordZipf(String word) {
return 0.0;
else
return Utilities.cBToZipf(cB.intValue());
-}
+}

public void close() {
env.close();
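Both new getters, getArticleCount and getPageCount, replace a bare .intValue() dereference with the same defensive pattern: look the statistic up once, cache it when present, and return 0 without caching when the store yields null. A self-contained sketch of that pattern under hypothetical names (the real code reads from KBLowerEnvironment as shown above):

import java.util.function.Supplier;

public class CachedStat {
    private int cached = -1; // -1 acts as the "not yet loaded" sentinel, as in the diff
    private final Supplier<Long> source;

    public CachedStat(Supplier<Long> source) {
        this.source = source;
    }

    public int get() {
        if (cached == -1) {
            Long value = source.get(); // may be null when the statistic is absent
            if (value != null) {
                cached = value.intValue(); // cache for subsequent calls
            } else {
                return 0; // do not cache: the store may be populated later
            }
        }
        return cached;
    }

    public static void main(String[] args) {
        CachedStat missing = new CachedStat(() -> null); // simulates an absent statistic
        System.out.println(missing.get()); // prints 0 instead of throwing a NullPointerException

        CachedStat present = new CachedStat(() -> 42L);
        System.out.println(present.get()); // prints 42, cached after the first lookup
    }
}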
