1
+ import { ParserClass } from "../../ParserClass" ;
2
+ import Instructions from "../../../../../components/instructions" ;
3
+ import Job from "../../../../../components/job" ;
4
+ import Article from "../../../../../components/articles" ;
5
+ import axios , { AxiosRequestConfig } from "axios" ;
6
+ import Utils from "../../Utils" ;
7
+ import https from "https" ;
8
+ import { AxiosConfig } from "../../../../../components/AxiosConfig" ;
9
+
10
/**
 * Shared HTTPS agent created with `rejectUnauthorized: false`, i.e. it does
 * not reject servers whose TLS certificate fails verification.
 * It is only attached to a request when the instructions carry a truthy
 * `ignoreCertificates` flag.
 */
const httpsAgent = new https.Agent({
    rejectUnauthorized: false,
});
11
+
12
+
13
/**
 * Parser that collects articles from a WordPress site through its REST API
 * (`wp-json/wp/v2/posts` and `wp-json/wp/v2/categories`).
 *
 * `assignInstructions` normalizes the source configuration into
 * `instructions.scrapeOptions.articles`; `parse` turns those options into a
 * posts query and converts each returned post into an `Article`.
 */
export class WordpressV2Parser extends ParserClass {
    /**
     * String-valued filter options as `[option key, REST query parameter]`.
     * The order is the order in which the options are stored and in which the
     * parameters are appended to the posts URL.
     */
    private static readonly FILTER_QUERY_PARAMS: ReadonlyArray<readonly [string, string]> = [
        ['search', 'search'],
        ['author', 'author'],
        ['authorExclude', 'author_exclude'],
        // `after` / `before` are expected to be ISO8601 compliant dates.
        ['after', 'after'],
        ['before', 'before'],
        ['slug', 'slug'],
        ['status', 'status'],
        ['categories', 'categories'],
        ['categoriesExclude', 'categories_exclude'],
        ['tags', 'tags'],
        ['tagsExclude', 'tags_exclude'],
    ];

    /** Returns `url` with exactly one `/` appended when it does not already end with one. */
    private static withTrailingSlash(url: string): string {
        return `${url}${url.endsWith('/') ? '' : '/'}`;
    }

    /**
     * Flattens a category `_links` object (relation name -> array of
     * `{ href }` entries) into a list of href strings. Anything that does not
     * have that shape is skipped instead of throwing.
     */
    private static collectLinks(links: any): string[] {
        const hrefs: string[] = [];
        if (links == null || typeof links !== 'object') return hrefs;

        for (const relation of Object.keys(links)) {
            const entries = links[relation];
            if (!Array.isArray(entries)) continue;

            for (const entry of entries) {
                if (typeof entry?.href === 'string') hrefs.push(entry.href);
            }
        }
        return hrefs;
    }

    /**
     * Currently performs no validation of the scrape configuration.
     *
     * @param scrape The scrape section of the source definition (unused).
     */
    validateScrape(scrape: object): void {
    }

    /**
     * Normalizes the source configuration and stores it on `instructions`.
     *
     * - Makes sure the URL of every entry in `instructions.url` ends with `/`
     *   (element 0 of a one-element entry, element 1 of a two-element entry),
     *   because `parse` appends the API path directly to it.
     * - Builds `instructions.scrapeOptions.articles` from
     *   `sourceJson.scrape.articles`, keeping only values of the expected
     *   type: `include` (strings only), `dates.gmt` / `dates.fallback`
     *   (booleans, default `false`), `filter.*` (strings, `sticky` boolean,
     *   otherwise `null`) and `thumbnail` (string, default `"thumbnail"`).
     *
     * @param instructions Instructions object to update in place.
     * @param sourceJson   Raw source definition; its `scrape.articles` section is optional.
     */
    assignInstructions(instructions: Instructions, sourceJson: any): void {
        for (const pair of instructions.url) {
            if (pair.length == 1)
                pair[0] = WordpressV2Parser.withTrailingSlash(pair[0]);
            else if (pair.length == 2)
                pair[1] = WordpressV2Parser.withTrailingSlash(pair[1]);
        }

        const scraped = sourceJson?.scrape?.articles;
        const articlesOpts: any = {
            include: [],
            dates: {},
            filter: {},
            thumbnail: "thumbnail"
        };

        if (scraped != null) {
            if (Array.isArray(scraped.include)) {
                for (const item of scraped.include) {
                    if (typeof item === 'string')
                        articlesOpts.include.push(item);
                }
            }

            if (scraped.dates) {
                articlesOpts.dates.gmt = typeof scraped.dates.gmt === 'boolean' ? scraped.dates.gmt : false;
                articlesOpts.dates.fallback = typeof scraped.dates.fallback === 'boolean' ? scraped.dates.fallback : false;
            }

            if (scraped.filter) {
                for (const [key] of WordpressV2Parser.FILTER_QUERY_PARAMS) {
                    const value = scraped.filter[key];
                    articlesOpts.filter[key] = typeof value === 'string' ? value : null;
                }
                articlesOpts.filter.sticky = typeof scraped.filter.sticky === 'boolean' ? scraped.filter.sticky : null;
            }

            articlesOpts.thumbnail = typeof scraped.thumbnail === 'string' ? scraped.thumbnail : 'thumbnail';
        }

        instructions.scrapeOptions = {
            articles: articlesOpts
        };
    }

    /**
     * Fetches categories and posts from the WordPress REST API and converts
     * the posts into articles.
     *
     * @param job    Job whose instructions (source, scrape options, decoder) drive the request.
     * @param alias  Not used by this parser.
     * @param url    Base URL of the site; the API path is appended directly, so it is
     *               expected to end with `/`.
     * @param amount Number of posts requested from the API (`per_page`).
     * @returns The parsed articles, at most `instructions.amount` of them.
     * @throws Error when a request fails, a response cannot be decoded/parsed as JSON,
     *               or the posts response is not a JSON array.
     */
    async parse(job: Job, alias: string, url: string, amount: number): Promise<Article[]> {
        const instructions = job.getInstructions();
        const source = instructions.getSource();
        const articleOpts = instructions.scrapeOptions.articles;

        const categoriesUrl = `${url}wp-json/wp/v2/categories/`;
        let postsUrl = `${url}wp-json/wp/v2/posts?_embed&per_page=${amount}`;

        // Every value is percent-encoded so characters such as '+', '&' or ' '
        // (e.g. in ISO8601 timezone offsets) cannot corrupt the query string.
        const filters: any = articleOpts.filter;
        for (const [key, param] of WordpressV2Parser.FILTER_QUERY_PARAMS) {
            const value = filters[key];
            if (value) postsUrl += `&${param}=${encodeURIComponent(String(value))}`;
        }
        // The REST API argument is `sticky`; only a truthy option adds it.
        if (filters.sticky) postsUrl += `&sticky=true`;

        const config: AxiosConfig & AxiosRequestConfig = {
            timeout: source.timeout,
            responseType: 'arraybuffer',
            responseEncoding: 'binary'
        };
        if (instructions["ignoreCertificates"]) config.httpsAgent = httpsAgent;

        let categories: any;
        let posts: any;
        try {
            // The two requests are independent, so they are issued together.
            const [categoriesResponse, postsResponse] = await Promise.all([
                axios.get(categoriesUrl, config),
                axios.get(postsUrl, config),
            ]);
            // Bodies arrive as raw bytes and are decoded with the instructions' decoder.
            categories = JSON.parse(instructions.textDecoder.decode(categoriesResponse?.data));
            posts = JSON.parse(instructions.textDecoder.decode(postsResponse?.data));
        } catch (e: unknown) {
            const reason = e instanceof Error ? e.message : String(e);
            throw new Error(`WordpressParserException job failed for ${source.name}, original error: ${reason}`);
        }

        if (!Array.isArray(posts)) {
            throw new Error(`WordpressParserException job failed for ${source.name}, original error: posts response is not an array`);
        }

        // A non-array categories payload simply yields no categories.
        const parsedCategories = Array.isArray(categories)
            ? categories.map((category: any) => ({
                id: category.id,
                description: Utils.htmlStrip(category.description, false),
                name: Utils.htmlStrip(category.name, false),
                links: WordpressV2Parser.collectLinks(category._links),
            }))
            : [];

        // Loop-invariant option handling, computed once for all posts.
        const dates = articleOpts.dates;
        const thumbnailSize = articleOpts.thumbnail;
        const include: string[] = articleOpts.include;
        const wantsModified = include.includes('modified');
        const extraFields = include.filter(field => field !== 'modified');

        const articles: Article[] = [];
        for (const p of posts) {
            // The cap is `instructions.amount`; the `amount` argument only sizes the request.
            if (articles.length >= instructions.amount) break;

            const article = new Article();
            article.setSource(source.getId(), source.name);
            article.setTitle(Utils.htmlStrip(p.title.rendered, false));
            article.setContent(p.content.rendered);
            article.setLink(p.link);

            // With `dates.gmt` the GMT date is used; the local date is only a
            // substitute when `dates.fallback` is set, otherwise no date is set.
            if (dates.gmt) {
                if (p.date_gmt != null)
                    article.setPubDate(p.date_gmt);
                else if (dates.fallback)
                    article.setPubDate(p.date);
            } else {
                article.setPubDate(p.date);
            }

            article.pushAttachments(Utils.extractLinks(article.content));

            for (const cId of p.categories ?? []) {
                const cat = parsedCategories.find((c: any) => c.id == cId);
                if (cat) article.pushCategory(cat.name, cat.links);
            }

            // Thumbnail: every step is optional so posts without featured media
            // (or without the requested size) yield `undefined` instead of throwing.
            const thumbnailUrl: string = p._embedded?.['wp:featuredmedia']?.[0]?.media_details?.sizes?.[thumbnailSize]?.source_url;
            article.setThumbnail(thumbnailUrl);

            // The date the object was last modified, following the same GMT rules as the publication date.
            if (wantsModified) {
                if (dates.gmt) {
                    if (p.modified_gmt != null)
                        article.addExtra('modified', p.modified_gmt);
                    else if (dates.fallback)
                        article.addExtra('modified', p.modified);
                } else {
                    article.addExtra('modified', p.modified);
                }
            }

            // Can get anything from guid, type, slug to title content etc...
            // Fields that carry a `rendered` value contribute that value.
            for (const field of extraFields) {
                if (p[field]?.rendered != null)
                    article.addExtra(field, p[field].rendered);
                else
                    article.addExtra(field, p[field]);
            }

            articles.push(article);
        }

        return articles;
    }

}
0 commit comments