Skip to content

Commit 0356fcc

Browse files
committed Mar 8, 2022
Merge remote-tracking branch 'origin/dev' into dev
2 parents 4dd6a7b + 15d9901 commit 0356fcc

File tree

10 files changed

+89
-2674
lines changed

10 files changed

+89
-2674
lines changed
 

‎package-lock.json

-2,664
This file was deleted.

‎src/components/AxiosConfig.ts

+2
Original file line number | Diff line number | Diff line change
@@ -6,5 +6,7 @@ export interface AxiosConfig {
66
timeout: number,
77
url?: string,
88
method?: string
9+
responseType: string,
10+
responseEncoding: string
911

1012
}

‎src/components/instructions.ts

+3
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ export default class Instructions {
1818
declare scrapeOptions: any;
1919
declare elementSelector: string;
2020
declare scrapeFunction: string;
21+
declare textDecoder: TextDecoder;
2122
declare ignoreCertificates: boolean;
2223
declare extraFields: string[];
2324

@@ -60,6 +61,7 @@ export default class Instructions {
6061
elementSelector: this.elementSelector,
6162
scrapeFunction: this.scrapeFunction,
6263
ignoreCertificates: this.ignoreCertificates,
64+
textDecoder: this.textDecoder,
6365
extraFields: this.extraFields
6466
}
6567
}
@@ -75,6 +77,7 @@ export default class Instructions {
7577
inst.elementSelector = json.elementSelector
7678
inst.scrapeFunction = json.scrapeFunction
7779
inst.ignoreCertificates = json.ignoreCertificates
80+
inst.textDecoder = json.textDecoder
7881
inst.extraFields = json.extraFields
7982

8083
return inst

‎src/components/source.ts

+4
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,10 @@ export default class Source {
6666
throw new Error(`SourceException: ${source.filename}: ignoreCertificates: is not valid, must be boolean.`);
6767
ret.instructions.ignoreCertificates = source.ignoreCertificates ? source.ignoreCertificates : false;
6868

69+
if (typeof source.encoding !== 'undefined' && typeof source.encoding !== 'string')
70+
throw new Error(`SourceException: ${source.filename}: encoding: is not valid, must be string value.`);
71+
ret.instructions.textDecoder = source.encoding ? new TextDecoder(`${source.encoding}`) : new TextDecoder();
72+
6973
ret.extra = source.extra;
7074

7175
ret.instructions.url = [];

‎src/modules/database/database.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,7 @@ export default abstract class Database {
8686
// And then check if they already exist.
8787
let hashes = dbArticles.map((article: Article) => article.getHash());
8888
articles = articles.filter((article: Article) => !hashes.includes(article.getHash()));
89-
Events.emit("workers.articles.new", articles); // Can be empty array
89+
Events.emit("workers.articles.new", articles,src); // Can be empty array
9090

9191
for (const article of articles) {
9292
let added = await this.pushArticle(src, article);

‎src/modules/events.ts

+2-2
Original file line number | Diff line number | Diff line change
@@ -76,8 +76,8 @@ export default class Events {
7676

7777
this.getAntennae().on("workers.articles.found", (articles: Article[], src: string) =>
7878
Logger(LoggerTypes.DEBUG, `${chalk.cyan('Articles')} - Finished job returned ${articles.length} articles for ${src}.`));
79-
this.getAntennae().on("workers.articles.new", (articles: Article[]) =>
80-
Logger(LoggerTypes.INFO, `${chalk.cyan('Articles')} - ${articles.length} articles will be added to to the db for ${articles[0].getSource().name}.`));
79+
this.getAntennae().on("workers.articles.new", (articles: Article[],src: string) =>
80+
Logger(LoggerTypes.INFO, `${chalk.cyan('Articles')} - ${articles.length} articles will be added to to the db for ${src}.`));
8181
}
8282

8383
if(logLevel === 'all' || logLevel === 'info' || logLevel === 'errors') {

‎src/modules/workers/parsers/drivers/HTMLParser.ts

+15-4
Original file line number | Diff line number | Diff line change
@@ -26,12 +26,14 @@ export class HTMLParser extends ParserClass {
2626
}
2727

2828
private static async request(url: string, timeout: number,instructions: Instructions): Promise<AxiosResponse> {
29-
return new Promise((resolve) => {
29+
return new Promise((resolve,reject) => {
3030

3131
let config : AxiosConfig = {
3232
method: 'get',
3333
url,
3434
timeout,
35+
responseType: 'arraybuffer',
36+
responseEncoding: 'binary'
3537
}
3638

3739
if(instructions["ignoreCertificates"]) config.httpsAgent = httpsAgent
@@ -57,7 +59,16 @@ export class HTMLParser extends ParserClass {
5759
private static attributes(location: cheerio.Cheerio,
5860
dataStoredAt: string,
5961
attributesArr: Array<string>): Array<Object> | null {
60-
62+
//Search into same element if Instructions(Find = null) and class is the same
63+
if(!dataStoredAt || dataStoredAt.length <= 0){
64+
return attributesArr.filter(item => location.attr(item)).map(item => {
65+
return {
66+
attribute: item, //attribute
67+
value: (location.attr(item)) ? location.attr(item) : "", //value_of__requested_attribute
68+
text: (location.text()) ? location.text() : "", //tag value
69+
}
70+
})
71+
}
6172
return attributesArr.filter(item => location.find(dataStoredAt).attr(item)).map(item => {
6273
return {
6374
attribute: item, //attribute
@@ -165,7 +176,7 @@ export class HTMLParser extends ParserClass {
165176

166177
await HTMLParser.request(url, instructions.getSource().timeout,instructions)
167178
.then((response: AxiosResponse) => {
168-
const cheerioLoad: cheerio.Root = cheerio.load(response.data)
179+
const cheerioLoad: cheerio.Root = cheerio.load( instructions.textDecoder.decode(response.data))
169180

170181
// for each article.
171182
cheerioLoad(`${instructions.elementSelector}`).each((index, element) => {
@@ -178,7 +189,7 @@ export class HTMLParser extends ParserClass {
178189

179190
// for each option. The options provided by instructions.
180191
for (let item in options) {
181-
if(options.hasOwnProperty(item) && !options[item].find && !options[item].multiple){
192+
if(options.hasOwnProperty(item) && !options[item].find && !options[item].multiple && !options[item].attributes){
182193
articleData[item] = cheerioLoad(element).find(options[item].class).text()
183194
}else if(options.hasOwnProperty(item)) {
184195
if (!options[item].attributes)

‎src/modules/workers/parsers/drivers/WordpressParser.ts

+5-3
Original file line number | Diff line number | Diff line change
@@ -31,13 +31,15 @@ export class WordpressParser extends ParserClass {
3131
let categories: any, posts: any[];
3232

3333
let config: (AxiosConfig) = {
34-
timeout: instructions.getSource().timeout
34+
timeout: instructions.getSource().timeout,
35+
responseType: 'arraybuffer',
36+
responseEncoding: 'binary'
3537
}
3638
if(instructions["ignoreCertificates"]) config.httpsAgent = httpsAgent
3739

3840
try {
39-
categories = (await axios.get(categoriesUrl, (config as AxiosRequestConfig)))?.data
40-
posts = (await axios.get(postsUrl, (config as AxiosRequestConfig)))?.data
41+
categories = JSON.parse(instructions.textDecoder.decode((await axios.get(categoriesUrl, (config as AxiosRequestConfig)))?.data))
42+
posts = JSON.parse(instructions.textDecoder.decode((await axios.get(postsUrl, (config as AxiosRequestConfig)))?.data))
4143
} catch (e: any) {
4244
throw new Error(`WordpressParserException job failed for ${instructions.getSource().name}, original error: ${e.message}`);
4345
}

‎test/index.js

+1
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@ try {
5555
}
5656
}
5757

58+
5859
(async () => {
5960
Error.stackTraceLimit = 200;
6061
let errors = []
+56
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,56 @@
1+
{
2+
"url": [
3+
["Γενικές Ανακοινώσεις","http://www.naval.ntua.gr/news"]
4+
],
5+
"name": "naval.ntua.gr",
6+
"encoding": "iso-8859-7",
7+
"type": "html",
8+
"scrape": {
9+
"container": "#page-content > .news_container",
10+
"article": {
11+
"link": {
12+
"class": ".catItemTitle",
13+
"find": [
14+
"a"
15+
],
16+
"attributes": [
17+
"href"
18+
],
19+
"multiple": false
20+
},
21+
"pubDate": {
22+
"class": ".catItemDateCreated",
23+
"attributes": [
24+
"value",
25+
"href"
26+
],
27+
"find": null,
28+
"multiple": false
29+
},
30+
"title": {
31+
"class": ".news_title",
32+
"find": null,
33+
"multiple": false
34+
},
35+
"content": {
36+
"class": ".catItemIntroText",
37+
"find": null,
38+
"multiple": false
39+
},
40+
"attachments": {
41+
"class": ".catItemLinks",
42+
"attributes": [
43+
"value",
44+
"href",
45+
"title"
46+
],
47+
"find": [
48+
".catItemAttachmentsBlock",
49+
"li",
50+
"a"
51+
],
52+
"multiple": true
53+
}
54+
}
55+
}
56+
}

0 commit comments

Comments
 (0)
Please sign in to comment.