Skip to content

Commit

Permalink
News Corp Australia: Scraping updates for more recent website editions (
Browse files Browse the repository at this point in the history
  • Loading branch information
zoe-translates authored Jul 14, 2023
1 parent d68b0aa commit 4c8a8d0
Showing 1 changed file with 48 additions and 13 deletions.
61 changes: 48 additions & 13 deletions News Corp Australia.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2021-06-01 16:55:37"
"lastUpdated": "2023-07-14 10:46:05"
}

/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2021 Abe Jellinek
Copyright © 2021 Abe Jellinek and contributors
This file is part of Zotero.
Expand Down Expand Up @@ -78,17 +78,24 @@ function scrape(doc, url) {
let meta = JSON.parse(text(doc, jsonSelector));

let item = new Zotero.Item('newspaperArticle');
item.title = meta.headline;
// Prefer the title on the page over metadata, because the latter may be a
// different title made for social media / SEO
item.title = text(doc, "#story-headline, .story-headline__title, .long-headline") || meta.headline;
item.date = ZU.strToISO(meta.datePublished);
item.abstractNote = meta.description;
if (meta.author) {
item.creators = meta.author.name
.split(/ and |, /)
.map(name => ZU.cleanAuthor(name, 'author', false));
}
item.publicationTitle = meta.publisher.name;
item.creators = getAuthors(meta.author, doc);
// Get rid of trailing strings inserted after publisher name for SEO
// purposes
item.publicationTitle = meta.publisher.name.match(/^(.+?)(?:[—|].*)?$/)[1].trim();
item.section = getSection(doc);
item.url = meta.mainEntityOfPage['@id'];
if (meta.mainEntityOfPage["@type"] === "WebPage"
&& /^https?:\/\//.test(meta.mainEntityOfPage["@id"])) {
// Page URL from JSON-LD should be less prone to line noise
item.url = meta.mainEntityOfPage["@id"];
}
if (!item.url) {
item.url = url;
}
item.libraryCatalog = '';
item.attachments.push({
title: "Snapshot",
Expand All @@ -99,14 +106,42 @@ function scrape(doc, url) {
item.complete();
}

function getAuthors(authorFromJSON, doc) {
let byline = authorFromJSON;
if (byline) {
// The author field may be an array (multiple authors) or an atom
// object (one author)
if (!Array.isArray(byline)) {
byline = [byline];
}
}
else {
// Fallback to scraping
byline = doc.querySelectorAll("meta[name='article:author']");
if (!byline) {
return null;
}
byline = Array.from(byline);
}
// The "author" field may be malformed (not an array, but the "name" field
// is a string with separators between author names). Deal with "," and
// "and" as separator.
return byline.flatMap(obj => obj.name
.split(/, |and /i)
.map(str => str.trim())
.filter(Boolean)
.map(str => ZU.cleanAuthor(str, "author", false))
);
}

function getSection(doc) {
let breadcrumbs = doc.querySelector('#breadcrumbs');
if (!breadcrumbs || breadcrumbs.childElementCount < 2) {
let breadCrumbItems = doc.querySelectorAll(".breadcrumbs__item, .breadcrumbs_li");
if (breadCrumbItems.length < 2) {
// we want to return null, not '', if we can't find a section tag
return text(doc, '.tg-tlc-storyheader_sectiontag') || null;
}

return ZU.capitalizeTitle(breadcrumbs.lastChild.innerText, true);
return ZU.capitalizeTitle(breadCrumbItems[breadCrumbItems.length - 1].innerText, true);
}

/** BEGIN TEST CASES **/
Expand Down

0 comments on commit 4c8a8d0

Please sign in to comment.