Skip to content

Commit b0c57a7

Browse files
authored Mar 31, 2022
Merge pull request #59 from poiw-org/wordpress-powerup
Wordpress powerup
2 parents 388c8bb + 893eb3c commit b0c57a7

12 files changed

+320
-139
lines changed
 

‎src/components/articles.ts

+40-28
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,28 @@ import Source from "./source";
22
import randomId from "../middleware/randomId"
33
import hash from 'crypto-js/sha256';
44

5+
/**
 * Shape of one attachment entry of an Article.
 *
 * Article.pushAttachment / Article.pushAttachments take values of this type, and
 * Utils.extractLinks returns an array of them built from the `<a>` elements of an HTML string.
 * Only `attribute` is required; `value` and `text` may be omitted.
 *
 * NOTE(review): the exact meaning of the three fields is not visible here - presumably
 * `attribute` names the kind of attachment (e.g. a link), `value` holds its target and
 * `text` its visible label. Confirm against Utils.extractLinks.
 */
export type Attachment = {
6+
attribute: string;
7+
value?: string;
8+
text?: string;
9+
};
10+
511
export default class Article {
6-
declare id: string
7-
declare title: string
8-
declare content: string
9-
declare link: string
10-
declare pubDate: string
11-
declare timestamp: number
12-
declare hash: string
13-
declare extras: {[key: string]: any}
12+
declare id: string;
13+
declare title: string;
14+
declare content: string;
15+
declare link: string;
16+
declare pubDate: string;
17+
declare timestamp: number;
18+
declare hash: string;
19+
declare extras: {[key: string]: any};
1420
declare source: {
15-
id: string
16-
name: string
17-
}
18-
declare attachments: object[]
19-
declare categories: object[]
21+
id: string;
22+
name: string;
23+
};
24+
declare attachments: object[];
25+
declare categories: object[];
26+
declare thumbnail: string;
2027

2128
/**
2229
* Article constructor
@@ -41,17 +48,18 @@ export default class Article {
4148
* Parse the article class to a json object
4249
*/
4350
static fromJSON(json: any): Article {
44-
const art = new Article(json.id)
45-
art.timestamp = json.timestamp
46-
art.title = json.title
47-
art.source = json.source
48-
art.link = json.link
49-
art.pubDate = json.pubDate
50-
art.content = json.content
51-
art.extras = json.extras
52-
art.hash = json.hash
53-
art.attachments = json.attachments
54-
art.categories = json.categories
51+
const art = new Article(json.id);
52+
art.timestamp = json.timestamp;
53+
art.title = json.title;
54+
art.source = json.source;
55+
art.link = json.link;
56+
art.pubDate = json.pubDate;
57+
art.content = json.content;
58+
art.extras = json.extras;
59+
art.hash = json.hash;
60+
art.attachments = json.attachments;
61+
art.categories = json.categories;
62+
art.thumbnail = json.thumbnail;
5563
return art
5664

5765
}
@@ -68,9 +76,13 @@ export default class Article {
6876
*/
6977
getHash() {
7078
if (!this.hash)
71-
this.hash = (hash(`${this.title} ${this.content} ${this.extras?.toString()} ${this.source.id}`)).toString()
79+
this.hash = (hash(`${this.title} ${this.content} ${this.extras?.toString()} ${this.source.id}`)).toString();
80+
81+
return this.hash;
82+
}
7283

73-
return this.hash
84+
public setThumbnail(thumbnail: string) {
85+
this.thumbnail = thumbnail;
7486
}
7587

7688
public setTitle(title: string) {
@@ -100,14 +112,14 @@ export default class Article {
100112
this.source = {id, name};
101113
}
102114

103-
public pushAttachment(attachment: object) {
115+
public pushAttachment(attachment: Attachment) {
104116
if(typeof this.attachments === 'undefined')
105117
this.attachments = [];
106118

107119
this.attachments.push(attachment);
108120
}
109121

110-
public pushAttachments(attachments: object[]) {
122+
public pushAttachments(attachments: Attachment[]) {
111123
if(typeof this.attachments === 'undefined')
112124
this.attachments = [];
113125

‎src/modules/workers/parsers/ParserLoader.ts

+16-7
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import {ParserType} from "./ParserType";
22
import {HTMLParser} from "./drivers/HTMLParser";
33
import {RSSParser} from "./drivers/RSSParser";
4-
import {WordpressParser} from "./drivers/WordpressParser";
4+
import {WordpressV2Parser} from "./drivers/wordpress/WordpressV2Parser";
55
import {DynamicParser} from "./drivers/DynamicParser";
66
import {ParserClass} from "./ParserClass";
77
import Instructions from "../../../components/instructions";
8+
import {WordpressV1Parser} from "./drivers/wordpress/WordpressV1Parser";
89

910
export default class ParserLoader {
1011

@@ -16,8 +17,11 @@ export default class ParserLoader {
1617
case ParserType.RSS:
1718
new RSSParser().validateScrape(scrapeOptions);
1819
break;
19-
case ParserType.WORDPRESS:
20-
new WordpressParser().validateScrape(scrapeOptions);
20+
case ParserType.WORDPRESS_V1:
21+
new WordpressV1Parser().validateScrape(scrapeOptions);
22+
break;
23+
case ParserType.WORDPRESS_V2:
24+
new WordpressV2Parser().validateScrape(scrapeOptions);
2125
break;
2226
case ParserType.DYNAMIC:
2327
new DynamicParser().validateScrape(scrapeOptions);
@@ -33,8 +37,11 @@ export default class ParserLoader {
3337
case ParserType.RSS:
3438
new RSSParser().assignInstructions(instructions, sourceJson);
3539
break
36-
case ParserType.WORDPRESS:
37-
new WordpressParser().assignInstructions(instructions, sourceJson);
40+
case ParserType.WORDPRESS_V1:
41+
new WordpressV1Parser().assignInstructions(instructions, sourceJson);
42+
break
43+
case ParserType.WORDPRESS_V2:
44+
new WordpressV2Parser().assignInstructions(instructions, sourceJson);
3845
break
3946
case ParserType.DYNAMIC:
4047
new DynamicParser().assignInstructions(instructions, sourceJson);
@@ -48,8 +55,10 @@ export default class ParserLoader {
4855
return new HTMLParser();
4956
case ParserType.RSS:
5057
return new RSSParser();
51-
case ParserType.WORDPRESS:
52-
return new WordpressParser();
58+
case ParserType.WORDPRESS_V1:
59+
return new WordpressV1Parser();
60+
case ParserType.WORDPRESS_V2:
61+
return new WordpressV2Parser();
5362
case ParserType.DYNAMIC:
5463
return new DynamicParser();
5564
}

‎src/modules/workers/parsers/ParserType.ts

+10-4
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ export enum ParserType {
55
RSS,
66
HTML,
77
DYNAMIC,
8-
WORDPRESS,
8+
WORDPRESS_V1,
9+
WORDPRESS_V2,
910
UNKNOWN
1011
}
1112

@@ -22,8 +23,11 @@ export namespace ParserType {
2223
return ParserType.RSS
2324
case "dynamic":
2425
return ParserType.DYNAMIC
26+
case "wordpress-v1":
27+
return ParserType.WORDPRESS_V1
2528
case "wordpress":
26-
return ParserType.WORDPRESS
29+
case "wordpress-v2":
30+
return ParserType.WORDPRESS_V2
2731
default:
2832
return ParserType.UNKNOWN
2933
}
@@ -37,8 +41,10 @@ export namespace ParserType {
3741
return "rss"
3842
case ParserType.DYNAMIC:
3943
return "dynamic"
40-
case ParserType.WORDPRESS:
41-
return "wordpress"
44+
case ParserType.WORDPRESS_V1:
45+
return "wordpress-v1"
46+
case ParserType.WORDPRESS_V2:
47+
return "wordpress-v2"
4248
default:
4349
return "unknown"
4450
}

‎src/modules/workers/parsers/Utils.ts

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import cheerio from "cheerio";
2+
import {Attachment} from "../../../components/articles";
23

34
const striptags = require('striptags');
45
export default class Utils {
@@ -290,11 +291,11 @@ export default class Utils {
290291
return text.toString()
291292
}
292293

293-
public static extractLinks(html: string): object[] {
294+
public static extractLinks(html: string): Attachment[] {
294295
if(!html || html == '') return [];
295296

296297
const $ = cheerio.load(html);
297-
const links: object[] = [];
298+
const links: Attachment[] = [];
298299

299300
$('a').each((index, element) => {
300301
links.push({

‎src/modules/workers/parsers/drivers/HTMLParser.ts

-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import Job from "../../../../components/job";
44
import Article from "../../../../components/articles";
55
import https from "https";
66
import axios, {AxiosRequestConfig, AxiosResponse} from "axios";
7-
import {reject} from "lodash";
87
import cheerio from "cheerio";
98
import Utils from "../Utils";
109
import {AxiosConfig} from "../../../../components/AxiosConfig";

‎src/modules/workers/parsers/drivers/WordpressParser.ts

-93
This file was deleted.
src/modules/workers/parsers/drivers/wordpress/WordpressV1Parser.ts

+28
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
import {ParserClass} from "../../ParserClass";
2+
import Instructions from "../../../../../components/instructions";
3+
import Job from "../../../../../components/job";
4+
import Article from "../../../../../components/articles";
5+
import axios, {AxiosRequestConfig} from "axios";
6+
import Utils from "../../Utils";
7+
import https from "https";
8+
import {AxiosConfig} from "../../../../../components/AxiosConfig";
9+
10+
11+
/**
 * Parser driver registered for the `wordpress-v1` source type.
 *
 * Only url normalization is available for now: {@link WordpressV1Parser.parse}
 * always fails because the scraping itself has not been implemented.
 */
export class WordpressV1Parser extends ParserClass {
    /**
     * Intentionally a no-op: this driver performs no validation of the scrape options.
     */
    validateScrape(scrape: object): void {}

    /**
     * Guarantees that the url of every entry in `instructions.url` ends with a slash.
     * The entries are modified in place.
     *
     * @param instructions The instructions whose url entries are normalized.
     * @param sourceJson The raw source file contents (not used by this driver).
     */
    assignInstructions(instructions: Instructions, sourceJson: any): void {
        for (const entry of instructions.url) {
            // One-element entries keep the url at index 0, two-element entries at index 1.
            // Entries of any other size are left untouched.
            const urlIndex = entry.length == 1 ? 0 : entry.length == 2 ? 1 : -1;
            if (urlIndex < 0) continue;

            const current = entry[urlIndex];
            entry[urlIndex] = current.endsWith('/') ? `${current}` : `${current}/`;
        }
    }

    /**
     * Not implemented yet.
     *
     * @throws Error always; since the method is async the caller receives a rejected promise.
     */
    async parse(job: Job, alias: string, url: string, amount: number): Promise<Article[]> {
        throw new Error(`WordpressParserV1Exception job failed because it is not implemented yet.`);
    }
}
src/modules/workers/parsers/drivers/wordpress/WordpressV2Parser.ts

+191
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,191 @@
1+
import {ParserClass} from "../../ParserClass";
2+
import Instructions from "../../../../../components/instructions";
3+
import Job from "../../../../../components/job";
4+
import Article from "../../../../../components/articles";
5+
import axios, {AxiosRequestConfig} from "axios";
6+
import Utils from "../../Utils";
7+
import https from "https";
8+
import {AxiosConfig} from "../../../../../components/AxiosConfig";
9+
10+
// Shared HTTPS agent with TLS certificate verification turned off (rejectUnauthorized: false).
// It is attached to the axios request config only when the instructions have
// "ignoreCertificates" set; all other requests keep the default certificate checks.
const httpsAgent = new https.Agent({rejectUnauthorized: false})
11+
12+
13+
/**
 * Parser driver for sites that expose the WordPress REST API v2 (`wp-json/wp/v2`).
 *
 * `assignInstructions` sanitizes the `scrape.articles` options of a source file and
 * `parse` downloads the categories and posts of one url and converts the posts to articles.
 */
export class WordpressV2Parser extends ParserClass {
    /**
     * Intentionally a no-op: this driver performs no validation of the scrape options.
     */
    validateScrape(scrape: object): void {
    }

    /**
     * Normalizes the source urls and stores the sanitized article options in
     * `instructions.scrapeOptions.articles`.
     *
     * Recognized options under `sourceJson.scrape.articles`:
     *  - `include`: string[] of extra post fields to copy into the article extras.
     *  - `dates.gmt` / `dates.fallback`: booleans (default false).
     *  - `filter.*`: query filters for the posts endpoint; values of the wrong type become null.
     *    Id filters (author, categories, tags and their exclude variants) accept a string
     *    or a finite number.
     *  - `thumbnail`: name of the image size to use (default "thumbnail").
     *
     * @param instructions The instructions to update (modified in place).
     * @param sourceJson The raw contents of the source file.
     */
    assignInstructions(instructions: Instructions, sourceJson: any): void {
        // parse() appends "wp-json/..." directly to the url, so it has to end with a slash.
        for (const pair of instructions.url) {
            if (pair.length == 1)
                pair[0] = ensureTrailingSlash(pair[0]);
            else if (pair.length == 2)
                pair[1] = ensureTrailingSlash(pair[1]);
        }

        const artScrapeOpts = sourceJson.scrape?.articles;
        const articlesOpts: any = {
            include: [],
            dates: {},
            filter: {},
            thumbnail: "thumbnail"
        };

        if (artScrapeOpts != null) {
            if (Array.isArray(artScrapeOpts.include)) {
                for (const item of artScrapeOpts.include) {
                    if (typeof item === 'string')
                        articlesOpts.include.push(item);
                }
            }

            if (artScrapeOpts.dates) {
                articlesOpts.dates.gmt = typeof artScrapeOpts.dates.gmt === 'boolean' ? artScrapeOpts.dates.gmt : false;
                articlesOpts.dates.fallback = typeof artScrapeOpts.dates.fallback === 'boolean' ? artScrapeOpts.dates.fallback : false;
            }

            if (artScrapeOpts.filter) {
                const rawFilter = artScrapeOpts.filter;
                const filter = articlesOpts.filter;

                filter.search = stringOrNull(rawFilter.search);
                filter.author = idListOrNull(rawFilter.author);
                filter.authorExclude = idListOrNull(rawFilter.authorExclude);

                // ISO8601 compliant date
                filter.after = stringOrNull(rawFilter.after);
                filter.before = stringOrNull(rawFilter.before);

                filter.slug = stringOrNull(rawFilter.slug);
                filter.status = stringOrNull(rawFilter.status);
                filter.categories = idListOrNull(rawFilter.categories);
                filter.categoriesExclude = idListOrNull(rawFilter.categoriesExclude);
                filter.tags = idListOrNull(rawFilter.tags);
                filter.tagsExclude = idListOrNull(rawFilter.tagsExclude);
                filter.sticky = typeof rawFilter.sticky === 'boolean' ? rawFilter.sticky : null;
            }

            articlesOpts.thumbnail = typeof artScrapeOpts.thumbnail === 'string' ? artScrapeOpts.thumbnail : 'thumbnail';
        }

        instructions.scrapeOptions = {
            articles: articlesOpts
        };
    }

    /**
     * Downloads the categories and the latest posts of a WordPress site and maps the posts to articles.
     *
     * @param job The job being executed; its instructions carry the options set by assignInstructions.
     * @param alias The alias of the url (not used by this driver).
     * @param url Base url of the site, expected to end with a slash.
     * @param amount Number of posts to request from the API.
     * @returns At most `instructions.amount` articles, in the order returned by the API.
     * @throws Error prefixed with "WordpressParserException" when a request fails, a response
     *         is not valid JSON or the posts response is not an array.
     */
    async parse(job: Job, alias: string, url: string, amount: number): Promise<Article[]> {
        const instructions = job.getInstructions();
        const articleOpts = instructions.scrapeOptions.articles;

        // The API returns only 10 categories unless told otherwise; 100 is the maximum page size.
        const categoriesUrl = `${url}wp-json/wp/v2/categories/?per_page=100`;
        const postsUrl = buildPostsUrl(url, amount, articleOpts.filter);

        const config: AxiosConfig & AxiosRequestConfig = {
            timeout: instructions.getSource().timeout,
            responseType: 'arraybuffer',
            responseEncoding: 'binary'
        };
        if (instructions["ignoreCertificates"]) config.httpsAgent = httpsAgent;

        let categories: any;
        let posts: any;
        try {
            // The two requests do not depend on each other, so they are sent together.
            const [categoriesResponse, postsResponse] = await Promise.all([
                axios.get(categoriesUrl, config),
                axios.get(postsUrl, config)
            ]);
            categories = JSON.parse(instructions.textDecoder.decode(categoriesResponse?.data));
            posts = JSON.parse(instructions.textDecoder.decode(postsResponse?.data));
        } catch (e: unknown) {
            const reason = e instanceof Error ? e.message : String(e);
            throw new Error(`WordpressParserException job failed for ${instructions.getSource().name}, original error: ${reason}`);
        }

        // Anything but an array (e.g. an error object) cannot be iterated below.
        if (!Array.isArray(posts))
            throw new Error(`WordpressParserException job failed for ${instructions.getSource().name}, original error: the posts response is not an array`);

        const parsedCategories = Array.isArray(categories) ? categories.map(parseCategory) : [];
        const articles: Article[] = [];

        for (const p of posts) {
            // Every processed post yields exactly one article, so the list length is the counter.
            if (articles.length >= instructions.amount) break;

            const article = new Article();
            article.setSource(instructions.getSource().getId(), instructions.getSource().name);
            article.setTitle(Utils.htmlStrip(p.title.rendered, false));
            article.setContent(p.content.rendered);
            article.setLink(p.link);

            // With dates.gmt the GMT date is preferred; the local date is used only when
            // dates.fallback allows it, otherwise the article is left without a pubDate.
            if (articleOpts.dates.gmt) {
                if (p.date_gmt != null)
                    article.setPubDate(p.date_gmt);
                else if (articleOpts.dates.fallback)
                    article.setPubDate(p.date);
            } else article.setPubDate(p.date);

            article.pushAttachments(Utils.extractLinks(article.content));

            const categoryIds: any[] = Array.isArray(p.categories) ? p.categories : [];
            for (const cId of categoryIds) {
                const cat = parsedCategories.find((c: any) => c.id == cId);
                if (cat) article.pushCategory(cat.name, cat.links);
            }

            // Thumbnail: taken from the embedded featured media, in the configured size.
            // Posts without featured media, or media without that size, get no thumbnail.
            const thumbnailSize = articleOpts.thumbnail;
            const thumbnailUrl = p._embedded?.['wp:featuredmedia']?.[0]?.media_details?.sizes?.[thumbnailSize]?.source_url;
            if (typeof thumbnailUrl === 'string')
                article.setThumbnail(thumbnailUrl);

            let include: string[] = articleOpts.include;
            // The date the object was last modified; follows the same gmt/fallback rules as pubDate.
            if (include.includes('modified')) {
                if (articleOpts.dates.gmt) {
                    if (p.modified_gmt != null)
                        article.addExtra('modified', p.modified_gmt);
                    else if (articleOpts.dates.fallback)
                        article.addExtra('modified', p.modified);
                } else article.addExtra('modified', p.modified);

                // Already handled, keep it out of the generic loop below.
                include = include.filter(s => s !== 'modified');
            }

            // Can get anything from guid, type, slug to title content etc...
            // Fields that come as {rendered: ...} objects are unwrapped.
            for (const elem of include) {
                if (p[elem]?.rendered != null)
                    article.addExtra(elem, p[elem].rendered);
                else article.addExtra(elem, p[elem]);
            }

            articles.push(article);
        }

        return articles;
    }

}

/**
 * Maps the filter option names used in source files to the query parameter names
 * of the posts endpoint, in the order they are appended to the url.
 */
const POSTS_FILTER_PARAMS: ReadonlyArray<readonly [string, string]> = [
    ['search', 'search'],
    ['author', 'author'],
    ['authorExclude', 'author_exclude'],
    ['after', 'after'],
    ['before', 'before'],
    ['slug', 'slug'],
    ['status', 'status'],
    ['categories', 'categories'],
    ['categoriesExclude', 'categories_exclude'],
    ['tags', 'tags'],
    ['tagsExclude', 'tags_exclude']
];

/** Returns `value` with a single '/' appended unless it already ends with one. */
function ensureTrailingSlash(value: string): string {
    return value.endsWith('/') ? value : `${value}/`;
}

/** Returns `value` when it is a string, otherwise null. */
function stringOrNull(value: unknown): string | null {
    return typeof value === 'string' ? value : null;
}

/** Like stringOrNull, but also accepts a finite number (a single id) and returns it as a string. */
function idListOrNull(value: unknown): string | null {
    if (typeof value === 'number' && Number.isFinite(value)) return String(value);
    return stringOrNull(value);
}

/**
 * Builds the url of the posts request, including the embedded resources, the page size
 * and every filter that has a value. All filter values are URL-encoded.
 */
function buildPostsUrl(baseUrl: string, amount: number, filters: any): string {
    let postsUrl = `${baseUrl}wp-json/wp/v2/posts?_embed&per_page=${amount}`;

    for (const [option, queryParam] of POSTS_FILTER_PARAMS) {
        const value = filters?.[option];
        if (value) postsUrl += `&${queryParam}=${encodeURIComponent(String(value))}`;
    }

    // Only an explicit `true` restricts the result to sticky posts.
    if (filters?.sticky) postsUrl += '&sticky=true';

    return postsUrl;
}

/**
 * Converts a raw category of the API into the compact form used while building articles:
 * its id, the stripped name and description, and every href found under `_links`.
 */
function parseCategory(category: any) {
    const links: string[] = [];

    for (const linkGroup of Object.values(category._links ?? {})) {
        if (!Array.isArray(linkGroup)) continue;
        for (const link of linkGroup)
            links.push(link.href);
    }

    return {
        id: category.id,
        description: Utils.htmlStrip(category.description, false),
        name: Utils.htmlStrip(category.name, false),
        links
    };
}

‎test/index.js

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ try {
2323
},
2424
sources: {
2525
path: "/test/sources",
26-
// includeOnly: ['rss-cs.unipi.gr'],
26+
// includeOnly: ['wordpress-financeclub.unipi.gr'],
2727
// exclude: ["custom-cs.unipi.gr"]
2828
},
2929
scheduler: {
@@ -65,6 +65,7 @@ try {
6565
await saffron.start()
6666

6767
saffron.on("workers.articles.found", (articles, src) => {
68+
console.log(src, articles.length)
6869
// console.log(util.inspect(articles, {showHidden: false, depth: null, colors: true}));
6970
})
7071

‎test/sources/wordpress-thumbnail.json

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"url": "http://financeclub.unipi.gr",
3+
"name": "wordpress-financeclub.unipi.gr",
4+
"type": "wordpress-v2",
5+
"ignoreCertificates": true,
6+
"scrape": {
7+
"articles": {
8+
"thumbnail": "2048x2048"
9+
}
10+
}
11+
}

‎test/sources/wordpress.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"url": "https://www.ds.unipi.gr/",
33
"name": "wordpress-cs.unipi.gr",
4-
"type": "wordpress"
4+
"type": "wordpress-v2"
55
}
+18-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,22 @@
11
{
22
"url": "https://www.geo.hua.gr/",
33
"name": "geo.hua.gr",
4-
"type": "wordpress",
5-
"ignoreCertificates": true
4+
"type": "wordpress-v2",
5+
"ignoreCertificates": true,
6+
"scrape": {
7+
"articles": {
8+
"include": [
9+
"id",
10+
"date",
11+
"date_gmt",
12+
"modified"
13+
],
14+
"filter": {
15+
"search": "Δηλώσεις",
16+
"author": 4,
17+
"sticky": true,
18+
"offset": 10
19+
}
20+
}
21+
}
622
}

0 commit comments

Comments
 (0)
Please sign in to comment.