Initial Commit
This commit is contained in:
298
translators/PRC History Review.js
Normal file
298
translators/PRC History Review.js
Normal file
@@ -0,0 +1,298 @@
|
||||
{
|
||||
"translatorID": "56854750-868a-4de0-bfe5-fe075344a121",
|
||||
"label": "PRC History Review",
|
||||
"creator": "Bo An",
|
||||
"target": "^https?://(www\\.)?prchistory\\.org/",
|
||||
"minVersion": "3.0",
|
||||
"maxVersion": "",
|
||||
"priority": 100,
|
||||
"inRepository": true,
|
||||
"translatorType": 4,
|
||||
"browserSupport": "gcsibv",
|
||||
"lastUpdated": "2021-12-30 18:33:59"
|
||||
}
|
||||
|
||||
/*
|
||||
***** BEGIN LICENSE BLOCK *****
|
||||
|
||||
Copyright © 2021 Bo An
|
||||
|
||||
This file is part of Zotero.
|
||||
|
||||
Zotero is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Zotero is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
***** END LICENSE BLOCK *****
|
||||
*/
|
||||
|
||||
/**
 * Zotero detection hook.
 *
 * Every page this translator handles is treated as a listing of PDF links,
 * so the only possible positive result is 'multiple'.
 *
 * @param {Document} doc - Page being inspected.
 * @param {string} url - URL of that page.
 * @returns {string|boolean} 'multiple' when getArticles() finds at least one
 *   link, otherwise false.
 */
function detectWeb(doc, url) {
	// getArticles() yields either a non-empty map or false.
	return getArticles(doc, url) ? 'multiple' : false;
}
|
||||
|
||||
/**
 * Zotero translation hook: let the user pick from the PDF links found on the
 * page and scrape one item per picked link.
 *
 * @param {Document} doc - Page being translated.
 * @param {string} url - URL of that page.
 */
function doWeb(doc, url) {
	const articles = getArticles(doc, url);
	if (!articles) {
		return;
	}

	Zotero.selectItems(articles, (items) => {
		if (!items) {
			return true;
		}

		// Home-page links are stand-alone articles without a dedicated child
		// page. Any other page is either a single-article page (exactly one
		// link was detected on it) or a multi-article issue. The count is of
		// links detected on the page, NOT of items the user selected.
		const onHomePage = isHomePageUrl(url);
		const isSingleArticle = !onHomePage && Object.keys(articles).length === 1;
		const isIssue = !onHomePage && !isSingleArticle;

		// Keys of `items` are the hrefs of the selected PDF links.
		for (const selectedHref in items) {
			scrape(doc, selectedHref, isIssue, isSingleArticle);
		}
		return true;
	});
}
|
||||
|
||||
/**
 * Collect the PDF links on the page that look like article entries.
 *
 * PRC History Review offers downloadable PDFs both on its home page and on
 * its child issue pages, and the two are scanned differently:
 *  - home page: every <a> whose href is a PDF is kept;
 *  - any other page: only the first <a> of each <p>/<h5> is considered, and
 *    it is kept only when its text looks like a title (longer than six
 *    characters and not containing the standalone word "here", which marks
 *    "click here"-style download links).
 *
 * @param {Document} doc - Page being inspected.
 * @param {string} url - URL of that page.
 * @returns {Object<string, string>|boolean} Map of PDF href to trimmed link
 *   text, or false when nothing was found.
 */
function getArticles(doc, url) {
	const items = {};

	if (isHomePageUrl(url)) {
		doc.querySelectorAll('a').forEach((link) => {
			const href = link.href;
			if (isPdfUrl(href)) {
				items[href] = link.textContent.trim();
			}
		});
	}
	else {
		doc.querySelectorAll('p, h5').forEach((containerEl) => {
			const link = containerEl.querySelectorAll('a')[0];
			if (!link) {
				return;
			}
			const href = link.href;
			if (!isPdfUrl(href)) {
				return;
			}
			const title = link.textContent.trim();
			// Match "here" as a whole word only, so titles containing
			// "where", "there", "sphere" etc. are not thrown away.
			const isDownloadPrompt = /\bhere\b/i.test(title);
			if (!isDownloadPrompt && title.length > 6) {
				items[href] = title;
			}
		});
	}

	return Object.keys(items).length > 0 ? items : false;
}
|
||||
|
||||
|
||||
// Since PRC History Review often links directly to PDFs, the url here, unlike with press journal articles, is often a PDF link.
|
||||
/**
 * Create and complete a Zotero 'journalArticle' item for one selected PDF link.
 *
 * To keep scraping decoupled from detection, the page is scanned again for
 * the element whose first anchor points at `url`; the title, issue and author
 * text around that link is then read from the page, so callers only have to
 * supply the link's href. One item is completed for every matching element.
 *
 * @param {Document} doc - Page being translated.
 * @param {string} url - href of the selected PDF link (not the page URL).
 * @param {boolean} isIssue - Treat the page as an issue listing several articles.
 * @param {boolean} isSingleArticle - Treat the page as holding a single article.
 *   When both flags are false the link is handled as a home-page entry.
 */
function scrape(doc, url, isIssue, isSingleArticle) {
	// Issue pages are scanned through <p>/<h5>, all other pages through <h5>/<h6>.
	const articleLinkEls = doc.querySelectorAll(isIssue ? 'p, h5' : 'h5, h6');

	articleLinkEls.forEach((articleDivEl, index) => {
		const linkEl = articleDivEl.querySelectorAll('a')[0];
		if (!linkEl) {
			return;
		}
		const href = linkEl.href;
		if (href !== url || !isPdfUrl(href)) {
			return;
		}

		const newItem = new Zotero.Item('journalArticle');
		newItem.publicationTitle = "The PRC History Review";
		const rawTitle = linkEl.textContent;

		// --- title ---
		if (isIssue) {
			newItem.title = rawTitle.trim();
		}
		else if (isSingleArticle) {
			// Link text is "<author>, “<title>”": drop the author part and the
			// curly quotes. Fall back to the whole text when there is no comma,
			// rather than saving an empty title.
			const titleParts = rawTitle.split(',');
			titleParts.shift();
			const strippedTitle = titleParts.join(',').replace(/[”“]/g, '').trim();
			newItem.title = strippedTitle || rawTitle.trim();
		}
		else {
			// Home-page link: an optional "<... Series>:" prefix precedes the title.
			const titleParts = rawTitle.split(':');
			const firstPart = titleParts[0].trim();
			if (firstPart.toLowerCase().endsWith('series')) {
				newItem.seriesTitle = firstPart;
				titleParts.shift();
				newItem.title = titleParts.join(':').trim();
			}
			else {
				newItem.title = rawTitle.trim();
			}
		}

		// --- volume / issue / date ---
		if (isIssue) {
			// The first <h5> of an issue page carries "Volume ★ Number ★ date".
			const issueInfoEl = doc.querySelectorAll('h5')[0];
			if (issueInfoEl) {
				scrapeApplyIssueInfo(newItem, issueInfoEl.textContent, true);
			}
		}
		else if (isSingleArticle) {
			// The issue info is expected two elements above the link element.
			// NOTE(review): `index` counts 'h5, h6' matches but is applied to
			// the 'h5'-only list, exactly as before; confirm against the real
			// page markup whether articleLinkEls[index - 2] was intended.
			if (index >= 2) {
				const issueInfoEl = doc.querySelectorAll('h5')[index - 2];
				if (issueInfoEl) {
					scrapeApplyIssueInfo(newItem, issueInfoEl.textContent, true);
				}
			}
		}
		else if (index >= 1) {
			// Home page: "Number ★ date" sits in the element just before the link.
			scrapeApplyIssueInfo(newItem, articleLinkEls[index - 1].textContent, false);
		}

		// --- authors ---
		if (isIssue) {
			// On issue pages the element following the link holds the authors.
			const authorEl = articleLinkEls[index + 1];
			if (authorEl) {
				scrapeAddAuthors(newItem, authorEl.textContent);
			}
		}
		if (isSingleArticle) {
			// On single-article pages the authors precede the first comma of the link text.
			const rawAuthorText = rawTitle.split(',')[0];
			if (rawAuthorText) {
				scrapeAddAuthors(newItem, rawAuthorText);
			}
		}

		// --- PDF attachment ---
		// `url` equals `href`, which was verified above to be a PDF link.
		newItem.attachments.push({
			url: url,
			mimeType: "application/pdf",
		});
		newItem.complete();
	});
}

/**
 * Fill volume/issue/date on `item` from a '★'-separated info line.
 * The line is ignored unless it has exactly the expected number of parts.
 *
 * @param {Object} item - Zotero item being built.
 * @param {string} infoText - "Volume N ★ Number M ★ date", or "Number M ★ date".
 * @param {boolean} hasVolume - Whether a leading volume part is expected.
 */
function scrapeApplyIssueInfo(item, infoText, hasVolume) {
	const parts = infoText.split('★');
	if (parts.length !== (hasVolume ? 3 : 2)) {
		return;
	}
	if (hasVolume) {
		item.volume = parts.shift().toLowerCase().replace('volume', '').trim();
	}
	item.issue = parts[0].toLowerCase().replace('number', '').trim();
	item.date = ZU.strToISO(parts[1]);
}

/**
 * Add one author creator per " and "-separated name in `authorsText`,
 * keeping only what precedes the first comma of each name.
 *
 * @param {Object} item - Zotero item being built.
 * @param {string} authorsText - e.g. "Jane Doe, Univ. A and John Roe, Univ. B".
 */
function scrapeAddAuthors(item, authorsText) {
	authorsText.trim().split(' and ').forEach((authorText) => {
		const authorName = authorText.split(',')[0].trim();
		if (authorName) {
			item.creators.push(ZU.cleanAuthor(authorName, 'author', false));
		}
	});
}
|
||||
|
||||
// helper functions.
|
||||
/**
 * Tell whether a URL points at a PDF file, judged by its ".pdf" extension
 * (case-insensitive).
 *
 * Besides URLs that literally end in ".pdf", URLs whose path ends in ".pdf"
 * before a query string or fragment are accepted. Non-string input yields
 * false instead of throwing.
 *
 * @param {string} url - URL to test.
 * @returns {boolean} True when the URL looks like a PDF link.
 */
function isPdfUrl(url) {
	if (typeof url !== 'string') {
		return false;
	}
	const lowered = url.toLowerCase();
	if (lowered.endsWith('.pdf')) {
		return true;
	}
	// Ignore anything from the first '?' or '#' onwards.
	const pathOnly = lowered.split(/[?#]/, 1)[0];
	return pathOnly.endsWith('.pdf');
}
|
||||
|
||||
/**
 * Tell whether a URL is the PRC History Review home page, i.e. whether it
 * ends in "the-prc-history-review/".
 *
 * A query string or fragment is ignored and the trailing slash is optional.
 * Non-string input yields false instead of throwing.
 *
 * @param {string} url - Page URL to test.
 * @returns {boolean} True for the journal's home page.
 */
function isHomePageUrl(url) {
	if (typeof url !== 'string') {
		return false;
	}
	const homeSlug = 'the-prc-history-review';
	if (url.endsWith(`${homeSlug}/`)) {
		return true;
	}
	// Ignore anything from the first '?' or '#' onwards.
	const pathOnly = url.split(/[?#]/, 1)[0];
	return pathOnly.endsWith(`${homeSlug}/`) || pathOnly.endsWith(homeSlug);
}
|
||||
|
||||
|
||||
/** BEGIN TEST CASES **/
|
||||
var testCases = [
|
||||
{
|
||||
"type": "web",
|
||||
"url": "http://prchistory.org/review-october-2021/",
|
||||
"items": "multiple"
|
||||
},
|
||||
{
|
||||
"type": "web",
|
||||
"url": "http://prchistory.org/the-prc-history-review-5-2/",
|
||||
"items": "multiple"
|
||||
},
|
||||
{
|
||||
"type": "web",
|
||||
"url": "http://prchistory.org/review-october-2017/",
|
||||
"items": "multiple"
|
||||
},
|
||||
{
|
||||
"type": "web",
|
||||
"url": "http://prchistory.org/review-april-2017/",
|
||||
"items": "multiple"
|
||||
},
|
||||
{
|
||||
"type": "web",
|
||||
"url": "http://prchistory.org/issue_6_3/",
|
||||
"items": "multiple"
|
||||
}
|
||||
]
|
||||
/** END TEST CASES **/
|
||||
Reference in New Issue
Block a user