Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Dalloz, Dalloz Bibliothèque, Lextenso, Vie Publique and Jus Politicum translators. #3301

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
113 changes: 113 additions & 0 deletions Dalloz Bibliotheque.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
{
"translatorID": "2ea86ad9-71ca-410c-9126-9d7d98722acf",
"label": "Dalloz Bibliothèque",
"creator": "Alexandre Mimms",
"target": "https?://(?:www[.-])?bibliotheque[.-]lefebvre[.-]dalloz(?:[.-]fr)?",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guessing the [.-]s are to support proxies? We don't do that here. Add your proxy to the Zotero Connector and the translator will get correct URLs automatically.

Suggested change
"target": "https?://(?:www[.-])?bibliotheque[.-]lefebvre[.-]dalloz(?:[.-]fr)?",
"target": "https?://(www\\.)?bibliotheque\\.lefebvre\\.dalloz\\.fr",

"minVersion": "5.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2024-04-18 12:43:54"
}

/*
***** BEGIN LICENSE BLOCK *****

Copyright © 2024 Alexandre Mimms

This file is part of Zotero.

Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.

***** END LICENSE BLOCK *****
*/


/**
 * Decide what kind of page the translator is looking at, from the URL alone.
 *
 * @param {Document} doc - The page document (not inspected here).
 * @param {string} url - The page URL.
 * @returns {string|false} 'book' for a book page (URL contains "/ouvrage/"),
 *   'multiple' for a search page (URL contains "/recherche"), false otherwise.
 */
function detectWeb(doc, url) {
	if (url.includes('/ouvrage/')) {
		return 'book';
	}
	else if (url.includes('/recherche')) {
		return 'multiple';
	}
	return false;
}

/**
 * Collect the entries of a search-result grid as a { url: title } map.
 *
 * Rows that lack a link or a title are skipped instead of throwing, so one
 * malformed card cannot break detection for the whole page.
 *
 * @param {Document} doc - The search page.
 * @param {boolean} checkOnly - When true, return true as soon as one usable row is found.
 * @returns {Object|boolean} The map of results, true in checkOnly mode, or false when nothing was found.
 */
function getSearchResults(doc, checkOnly) {
	const items = {};
	let found = false;

	for (const row of doc.querySelectorAll('.result-list-grid-item')) {
		// querySelector returns null when the element is missing; guard before dereferencing.
		const link = row.querySelector("a");
		const href = link ? link.href : null;
		// text() yields an empty string when the selector matches nothing.
		const title = ZU.trimInternal(text(row, ".detail-title"));
		if (!href || !title) continue;
		if (checkOnly) return true;
		found = true;
		items[href] = title;
	}
	return found ? items : false;
}

/**
 * Translator entry point: save the current book page, or let the user pick
 * entries from a search page and save each selected one.
 *
 * @param {Document} doc - The page being translated.
 * @param {string} url - The page URL.
 */
async function doWeb(doc, url) {
	// Single item: scrape the page we are already on.
	if (detectWeb(doc, url) !== 'multiple') {
		await scrape(doc, url);
		return;
	}

	const selection = await Zotero.selectItems(getSearchResults(doc, false));
	if (!selection) return;

	// Fetch and scrape the chosen results one after another.
	for (const itemUrl of Object.keys(selection)) {
		const itemDoc = await requestDocument(itemUrl);
		await scrape(itemDoc);
	}
}

/**
 * Build and save a Zotero "book" item from a Dalloz Bibliothèque book page.
 *
 * @param {Document} doc - The book page.
 * @param {string} [url=doc.location.href] - URL stored on the item.
 */
async function scrape(doc, url = doc.location.href) {
	// NOTE(review): the css-* / e4d31s30 class names look generated by the site's
	// styling toolchain and may change between deployments — confirm they are stable.
	const edition = ZU.trimInternal(text(doc, ".editions-edition.css-p7sjbi", 0)).split(" ")[0];
	const date = ZU.trimInternal(text(doc, ".editions-date.css-p7sjbi", 0)).replace(/Edition\s?:\s?/, "");
	const collection = ZU.trimInternal(text(doc, ".notice-header-grid-item.css-1o256gd.e4d31s30:not(.first-item) .notice-header-link", 0));
	const isbn = ZU.trimInternal(text(doc, ".notice-header-grid-item.css-leol38.e4d31s30 .notice-header-link", 0));
	let marque = ZU.trimInternal(text(doc, ".notice-header-grid-item.css-xc5jw0.e4d31s30 .notice-header-link", 0));
	// Keep the first character as is and lower-case the rest ("DALLOZ" -> "Dalloz").
	marque = marque.substring(0, 1) + marque.substring(1).toLowerCase();
	const auteurs = ZU.trimInternal(text(doc, ".notice-header-grid-item.css-2bwjgy.e4d31s30 .notice-header-link", 0)).split(" • ");
	const titre = ZU.trimInternal(text(doc, ".title", 0));
	// Drop the leading "Description" label together with the whitespace that follows it.
	const abstract = ZU.trimInternal(text(doc, ".description", 0)).replace(/^Description\s*/, "");

	const newItem = new Z.Item("book");

	for (const auteur of auteurs) {
		// "".split(" • ") yields [""]; do not create an empty creator for it.
		if (!auteur) continue;
		// cleanAuthor keeps multi-word names intact and returns a proper two-field creator.
		newItem.creators.push(ZU.cleanAuthor(auteur, "author"));
	}

	newItem.title = titre;
	newItem.date = date;
	newItem.abstractNote = abstract;
	newItem.ISBN = isbn;
	newItem.edition = edition;
	newItem.publisher = marque;
	// Use a language code rather than the English word "french".
	newItem.language = "fr";
	newItem.series = collection;
	newItem.url = url;

	newItem.complete();
}

carnetdethese marked this conversation as resolved.
Show resolved Hide resolved
242 changes: 242 additions & 0 deletions Dalloz.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
{
"translatorID": "a59e99a6-42b0-4be6-bb0c-1ff688c3a8b3",
"label": "Dalloz",
"creator": "Alexandre Mimms",
"target": "https?://(?:www[.-])?dalloz(?:[.-]fr)?",
carnetdethese marked this conversation as resolved.
Show resolved Hide resolved
"minVersion": "5.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2024-04-18 12:12:00"
}

/*
***** BEGIN LICENSE BLOCK *****

Copyright © 2024 Alexandre Mimms

This file is part of Zotero.

Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.

***** END LICENSE BLOCK *****
*/
// TODO
// - Make sure that case reports are saved correctly.
// - PDF import : needs reverse engineering the internal api of the service. Seems like a quite complex one.

// Reference with an issue number: non-digit text, a four-digit year, "n° <digits>", then optionally "p. <digits>".
const citationAvecNumero = /^([\D]+)\s*(\d{4}),?\s?(n°\s?\d+) *,?\s*(p\.\s?\d+)*/;
// Reference without an issue number: non-digit text, a four-digit year, then optionally "p.<digits>".
const citationSansNumero = /^([\D]+)\s*(\d{4}),?\s*(p\.\d+)?/;
// Any run of four digits.
const regAnnee = /\d{4}/;
// Captures the value after "id=" up to the first "%" (or the end of the string).
const docTypeId = /id=([^%]+)(?:%2F)?/;
carnetdethese marked this conversation as resolved.
Show resolved Hide resolved

// Maps the code found at the start of a document ID to the Zotero item type
// that detectWeb reports. Each code is listed once, and every value is a
// valid Zotero item type name.
const codeDocument = new Map([
	["ENCY", "dictionaryEntry"],
	["JA", "journalArticle"],
	["AJ", "journalArticle"],
	["ACTU", "blogPost"],
	["RFDA", "journalArticle"],
	["CONS", "journalArticle"],
	["DIPI", "journalArticle"],
	["DS", "journalArticle"],
	["JT", "journalArticle"],
	["JS", "journalArticle"],
	["JCAS", "journalArticle"],
	["LEGI", "journalArticle"],
	["CAHJ", "journalArticle"],
	["RDI", "journalArticle"],
	["RDSS", "journalArticle"],
	["RECU", "journalArticle"],
	["LEBO", "case"],
	["REV", "journalArticle"],
	["RMC", "journalArticle"],
	["RSC", "journalArticle"],
	["RTD", "journalArticle"],
	["RPR", "journalArticle"],
	["RCJ", "journalArticle"]
]);

/**
 * Tell whether a document ID begins with one of the codes listed in codeDocument.
 * Several IDs share the same leading code (e.g. "AJDA" and "AJFP" both start with "AJ"),
 * which is simpler than listing every possible ID.
 *
 * @param {string} string - The document ID (or its leading characters).
 * @returns {boolean} true when some code is a prefix of the ID; false otherwise,
 *   including for an empty ID.
 */
function idStartsWithKey(string) {
	// Every string starts with "", so an empty ID must be rejected explicitly.
	if (!string) return false;
	for (const key of codeDocument.keys()) {
		// The ID must start with the code, not the other way round.
		if (string.startsWith(key)) {
			return true;
		}
	}
	return false;
}

/**
 * Split a Dalloz reference line (e.g. "RFDA 2020, n° 3, p. 45") into its parts.
 *
 * Capture groups are read by position, so a journal whose name starts with
 * "p" or "n" is not mistaken for a page or an issue number, and groups that
 * did not take part in the match are simply left undefined.
 *
 * @param {string} refDoc - The cleaned reference line.
 * @returns {{revue: (string|undefined), date: (string|undefined),
 *   numRevue: (string|undefined), page: (string|undefined)}}
 */
function parseRefDoc(refDoc) {
	let revue, date, numRevue, page;
	const avecNumero = refDoc.match(citationAvecNumero);
	const sansNumero = avecNumero ? null : refDoc.match(citationSansNumero);

	if (avecNumero) {
		[, revue, date, numRevue, page] = avecNumero;
	}
	else if (sansNumero) {
		[, revue, date, page] = sansNumero;
	}
	else {
		// Unknown layout: salvage the year only rather than guessing the other parts.
		const annee = refDoc.match(regAnnee);
		if (annee) date = annee[0];
	}

	return {
		// The journal group is greedy and keeps the separator before the year.
		revue: revue ? revue.replace(/[\s,]+$/, "") : undefined,
		date: date,
		numRevue: numRevue ? numRevue.replace(/^n°\s*/, "") : undefined,
		page: page ? page.replace(/^p\.\s*/, "") : undefined
	};
}

/**
 * Build and save a Zotero "journalArticle" item from a Dalloz article page.
 *
 * @param {Document} doc - The article page.
 * @param {string} [url=doc.location.href] - URL stored on the item.
 */
function scrapeJournalArticle(doc, url = doc.location.href) {
	// Since searches trigger a "< >" markup around the searched words, we have to edit that away before storing the values.
	const titre = ZU.trimInternal(text(doc, ".chronTITRE", 0)).replace(/[<>]/g, ""); // title of the document
	const abstract = ZU.trimInternal(text(doc, "#RESUFRAN")).replace(/[<>]/g, ""); // abstract
	const refDoc = ZU.trimInternal(text(doc, ".refDoc", 0).replace(/[<>]/g, "")); // reference line

	const reference = parseRefDoc(refDoc);

	const newItem = new Z.Item("journalArticle");
	newItem.title = titre;

	// Each signature reads "<name>, <rest>"; only the part before the first comma is used.
	for (const signature of doc.querySelectorAll(".chronSIGNATURE")) {
		const nom = ZU.trimInternal((signature.textContent || "").replace(/[<>]/g, "").split(",")[0]);
		if (!nom) continue;
		// cleanAuthor keeps multi-word names intact and returns a proper two-field creator.
		newItem.creators.push(ZU.cleanAuthor(nom, "author"));
	}

	if (reference.revue) newItem.publicationTitle = reference.revue;
	newItem.abstractNote = abstract;
	if (reference.numRevue) newItem.issue = reference.numRevue;
	if (reference.page) newItem.pages = reference.page;
	if (reference.date) newItem.date = reference.date;
	newItem.url = url;
	// Use a language code rather than the English word "french".
	newItem.language = "fr";
	newItem.complete();
}

/**
 * Build and save a Zotero "case" item from a Dalloz case-law page.
 *
 * @param {Document} doc - The decision page.
 * @param {string} [url=doc.location.href] - URL stored on the item; also used to infer the court.
 */
function scrapeCase(doc, url = doc.location.href) {
	// The court is only known for documents whose URL contains "LEBON".
	let juridiction;
	if (url.includes("LEBON")) {
		juridiction = "Conseil d'État";
	}

	// Since searches trigger a "< >" markup around the searched words, we have to edit that away before storing the values.
	const champ = (selector, index) => ZU.trimInternal(text(doc, selector, index).replace(/[<>]/g, ""));

	const titre = champ(".jurisJURI", 0); // title of the document
	const abstract = champ(".jurisSOMMAIRE"); // summary
	const formation = champ(".jurisCHAM", 0); // stored in the item's extra field
	const date = champ(".jurisDATE", 0);
	// Third "-"-separated part of the date; undefined when the date has fewer parts.
	// NOTE(review): presumably the year of a DD-MM-YYYY date — confirm the page's date format.
	const volume = date.split("-")[2];
	const mentionPublication = champ(".commentPopupNDC b", 0);
	const numeroAffaire = ZU.trimInternal(text(doc, ".jurisNAAF", 0).replace(/[<>]/g, "").replace("n° ", ""));

	const newItem = new Z.Item("case");
	newItem.caseName = titre;
	newItem.reporter = mentionPublication;
	newItem.abstractNote = abstract.replace("Sommaire : ", "");
	newItem.court = juridiction;
	newItem.dateDecided = date;
	newItem.reporterVolume = volume;
	newItem.docketNumber = numeroAffaire;
	// Use a language code rather than the English word "french".
	newItem.language = "fr";
	newItem.url = url;
	newItem.extra = formation;
	newItem.complete();
}

// function scrapeBlog(doc, url = doc.location.href) {

// }
carnetdethese marked this conversation as resolved.
Show resolved Hide resolved

/**
 * Decide what kind of page the translator is looking at, from the URL alone.
 *
 * For a document page, the type comes from the code at the start of the "id"
 * URL parameter, looked up in codeDocument.
 *
 * @param {Document} doc - The page document (not inspected here).
 * @param {string} url - The page URL.
 * @returns {string|false} The mapped item type, 'multiple' for a result list, or false.
 */
function detectWeb(doc, url) {
	if (url.includes('/documentation/Document')) { // The page is a document.
		const match = url.match(docTypeId);
		// A document URL without an id parameter cannot be classified.
		if (!match) return false;
		const id = match[1].substring(0, 4);
		if (!idStartsWithKey(id)) return false;
		// Codes are two to four characters long; try the longest prefix first so that
		// "RTDC…" resolves through "RTD" and "AJDA…" through "AJ".
		for (let length = id.length; length >= 2; length--) {
			const type = codeDocument.get(id.substring(0, length));
			if (type) return type;
		}
		return false;
	}
	else if (url.includes('/documentation/Liste')) { // The page is a list of results.
		return 'multiple';
	}
	return false;
}
carnetdethese marked this conversation as resolved.
Show resolved Hide resolved

/**
 * Collect the entries of a Dalloz result list as an { href: title } map.
 * Based on the standard translator template, with Dalloz-specific selectors.
 *
 * @param {Document} doc - The result-list page.
 * @param {boolean} checkOnly - When true, return true as soon as one usable row is found.
 * @returns {Object|boolean} The map of results, true in checkOnly mode, or false when nothing was found.
 */
function getSearchResults(doc, checkOnly) {
	const items = {};
	let found = false;
	for (const row of doc.querySelectorAll('.result-content')) {
		// The first link of a row carries both the target and the title.
		const href = attr(row, "a", "href", 0);
		const title = ZU.trimInternal(text(row, "a", 0));
		if (!href || !title) continue;
		if (checkOnly) return true;
		found = true;
		items[href] = title;
	}
	return found ? items : false;
}

/**
 * Translator entry point: save the current document, or let the user pick
 * entries from a result list and save each selected one.
 *
 * @param {Document} doc - The page being translated.
 * @param {string} url - The page URL.
 */
async function doWeb(doc, url) {
	// Detect once and hand the type to scrape, so it does not have to be recomputed.
	const docType = detectWeb(doc, url);

	if (docType == 'multiple') {
		const items = await Zotero.selectItems(getSearchResults(doc, false));
		if (!items) return;
		for (const itemUrl of Object.keys(items)) {
			const itemDoc = await requestDocument(itemUrl);
			// Each selected result has its own type: detect it from the fetched document's
			// URL, otherwise scrape receives no type and nothing is saved.
			const itemHref = itemDoc.location.href;
			await scrape(itemDoc, itemHref, detectWeb(itemDoc, itemHref));
		}
	}
	else {
		await scrape(doc, url, docType);
	}
}

/**
 * Dispatch a document page to the scraper matching its type.
 *
 * @param {Document} doc - The document page.
 * @param {string} [url=doc.location.href] - The page URL.
 * @param {string|false} [docType=detectWeb(doc, url)] - Item type of the page. When the
 *   caller does not supply it, it is detected here so that the page is not silently skipped.
 */
async function scrape(doc, url = doc.location.href, docType = detectWeb(doc, url)) {
	if (docType == "journalArticle") {
		scrapeJournalArticle(doc, url);
	}
	else if (docType == "case") {
		scrapeCase(doc, url);
	}
	// Other detected types (e.g. "blogPost") have no scraper yet and are not saved.
}
/** BEGIN TEST CASES **/
var testCases = [
]
/** END TEST CASES **/
Loading
Loading