1
1
import { EventEmitter } from 'events' ;
2
2
import { URL } from 'url' ;
3
3
import * as http from 'http' ;
4
- import * as fs from 'fs' ;
5
- import * as util from 'util' ;
6
- import * as path from 'path' ;
7
4
8
5
import { request , GaxiosResponse } from 'gaxios' ;
9
- import PQueue , { DefaultAddOptions } from 'p-queue' ;
10
- import PriorityQueue from 'p-queue/dist/priority-queue' ;
11
- import * as globby from 'glob' ;
12
6
7
+ import { Queue } from './queue' ;
13
8
import { getLinks } from './links' ;
14
9
import { startWebServer } from './server' ;
10
+ import { CheckOptions , processOptions } from './options' ;
15
11
16
- const stat = util . promisify ( fs . stat ) ;
17
- const glob = util . promisify ( globby ) ;
18
-
19
- export interface CheckOptions {
20
- concurrency ?: number ;
21
- port ?: number ;
22
- path : string | string [ ] ;
23
- recurse ?: boolean ;
24
- timeout ?: number ;
25
- markdown ?: boolean ;
26
- linksToSkip ?: string [ ] | ( ( link : string ) => Promise < boolean > ) ;
27
- serverRoot ?: string ;
28
- directoryListing ?: boolean ;
29
- }
12
+ export { CheckOptions } ;
30
13
31
14
/**
 * Possible outcomes for a single checked link.
 */
export enum LinkState {
  // The link resolved successfully.
  OK = 'OK',
  // The link failed to resolve (request error or bad HTTP status).
  BROKEN = 'BROKEN',
  // The link was not checked (e.g. non-http[s] protocol or skip rule).
  SKIPPED = 'SKIPPED',
}
36
19
20
/**
 * Payload emitted with the 'retry' event when a request is rescheduled
 * after the server responded with HTTP 429 and a `retry-after` header.
 */
export interface RetryInfo {
  // The URL of the request that will be retried.
  url: string;
  // Number of seconds until the retry attempt is made.
  secondsUntilRetry: number;
  // The HTTP status code that triggered the retry.
  status: number;
}
25
+
37
26
export interface LinkResult {
38
27
url : string ;
39
28
status ?: number ;
@@ -53,9 +42,11 @@ interface CrawlOptions {
53
42
crawl : boolean ;
54
43
results : LinkResult [ ] ;
55
44
cache : Set < string > ;
45
+ delayCache : Map < string , number > ;
56
46
checkOptions : CheckOptions ;
57
- queue : PQueue < PriorityQueue , DefaultAddOptions > ;
47
+ queue : Queue ;
58
48
rootPath : string ;
49
+ retry : boolean ;
59
50
}
60
51
61
52
// Spoof a normal looking User-Agent to keep the servers happy
@@ -64,6 +55,12 @@ export const headers = {
64
55
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36' ,
65
56
} ;
66
57
58
/**
 * Typed event overloads for LinkChecker. This declaration merges with the
 * LinkChecker class defined below, giving `on(...)` strongly typed listener
 * signatures for each event name instead of EventEmitter's untyped defaults.
 */
export declare interface LinkChecker {
  // Emitted once for every link checked, with its result.
  on(event: 'link', listener: (result: LinkResult) => void): this;
  // Emitted when a new page begins to be scanned.
  on(event: 'pagestart', listener: (link: string) => void): this;
  // Emitted when a request is rescheduled due to an HTTP 429.
  on(event: 'retry', listener: (details: RetryInfo) => void): this;
}
63
+
67
64
/**
68
65
* Instance class used to perform a crawl job.
69
66
*/
@@ -74,7 +71,7 @@ export class LinkChecker extends EventEmitter {
74
71
* @param options Options to use while checking for 404s
75
72
*/
76
73
async check ( opts : CheckOptions ) {
77
- const options = await this . processOptions ( opts ) ;
74
+ const options = await processOptions ( opts ) ;
78
75
if ( ! Array . isArray ( options . path ) ) {
79
76
options . path = [ options . path ] ;
80
77
}
@@ -101,12 +98,13 @@ export class LinkChecker extends EventEmitter {
101
98
console . log ( options ) ;
102
99
}
103
100
104
- const queue = new PQueue ( {
101
+ const queue = new Queue ( {
105
102
concurrency : options . concurrency || 100 ,
106
103
} ) ;
107
104
108
105
const results = new Array < LinkResult > ( ) ;
109
106
const initCache : Set < string > = new Set ( ) ;
107
+ const delayCache : Map < string , number > = new Map ( ) ;
110
108
111
109
for ( const path of options . path ) {
112
110
const url = new URL ( path ) ;
@@ -118,8 +116,10 @@ export class LinkChecker extends EventEmitter {
118
116
checkOptions : options ,
119
117
results,
120
118
cache : initCache ,
119
+ delayCache,
121
120
queue,
122
121
rootPath : path ,
122
+ retry : ! ! opts . retry ,
123
123
} ) ;
124
124
} ) ;
125
125
}
@@ -135,121 +135,6 @@ export class LinkChecker extends EventEmitter {
135
135
return result ;
136
136
}
137
137
138
- /**
139
- * Validate the provided flags all work with each other.
140
- * @param options CheckOptions passed in from the CLI (or API)
141
- */
142
- private async processOptions ( opts : CheckOptions ) : Promise < CheckOptions > {
143
- const options = Object . assign ( { } , opts ) ;
144
-
145
- // ensure at least one path is provided
146
- if ( options . path . length === 0 ) {
147
- throw new Error ( 'At least one path must be provided' ) ;
148
- }
149
-
150
- // normalize options.path to an array of strings
151
- if ( ! Array . isArray ( options . path ) ) {
152
- options . path = [ options . path ] ;
153
- }
154
-
155
- // disable directory listings by default
156
- if ( options . directoryListing === undefined ) {
157
- options . directoryListing = false ;
158
- }
159
-
160
- // Ensure we do not mix http:// and file system paths. The paths passed in
161
- // must all be filesystem paths, or HTTP paths.
162
- let isUrlType : boolean | undefined = undefined ;
163
- for ( const path of options . path ) {
164
- const innerIsUrlType = path . startsWith ( 'http' ) ;
165
- if ( isUrlType === undefined ) {
166
- isUrlType = innerIsUrlType ;
167
- } else if ( innerIsUrlType !== isUrlType ) {
168
- throw new Error (
169
- 'Paths cannot be mixed between HTTP and local filesystem paths.'
170
- ) ;
171
- }
172
- }
173
-
174
- // if there is a server root, make sure there are no HTTP paths
175
- if ( options . serverRoot && isUrlType ) {
176
- throw new Error (
177
- "'serverRoot' cannot be defined when the 'path' points to an HTTP endpoint."
178
- ) ;
179
- }
180
-
181
- if ( options . serverRoot ) {
182
- options . serverRoot = path . normalize ( options . serverRoot ) ;
183
- }
184
-
185
- // expand globs into paths
186
- if ( ! isUrlType ) {
187
- const paths : string [ ] = [ ] ;
188
- for ( const filePath of options . path ) {
189
- // The glob path provided is relative to the serverRoot. For example,
190
- // if the serverRoot is test/fixtures/nested, and the glob is "*/*.html",
191
- // The glob needs to be calculated from the serverRoot directory.
192
- const fullPath = options . serverRoot
193
- ? path . join ( options . serverRoot , filePath )
194
- : filePath ;
195
- const expandedPaths = await glob ( fullPath ) ;
196
- if ( expandedPaths . length === 0 ) {
197
- throw new Error (
198
- `The provided glob "${ filePath } " returned 0 results. The current working directory is "${ process . cwd ( ) } ".`
199
- ) ;
200
- }
201
- // After resolving the globs, the paths need to be returned to their
202
- // original form, without the serverRoot included in the path.
203
- for ( let p of expandedPaths ) {
204
- p = path . normalize ( p ) ;
205
- if ( options . serverRoot ) {
206
- const contractedPath = p
207
- . split ( path . sep )
208
- . slice ( options . serverRoot . split ( path . sep ) . length )
209
- . join ( path . sep ) ;
210
- paths . push ( contractedPath ) ;
211
- } else {
212
- paths . push ( p ) ;
213
- }
214
- }
215
- }
216
- options . path = paths ;
217
- }
218
-
219
- // enable markdown if someone passes a flag/glob right at it
220
- if ( options . markdown === undefined ) {
221
- for ( const p of options . path ) {
222
- if ( path . extname ( p ) . toLowerCase ( ) === '.md' ) {
223
- options . markdown = true ;
224
- }
225
- }
226
- }
227
-
228
- // Figure out which directory should be used as the root for the web server,
229
- // and how that impacts the path to the file for the first request.
230
- if ( ! options . serverRoot && ! isUrlType ) {
231
- // if the serverRoot wasn't defined, and there are multiple paths, just
232
- // use process.cwd().
233
- if ( options . path . length > 1 ) {
234
- options . serverRoot = process . cwd ( ) ;
235
- } else {
236
- // if there's a single path, try to be smart and figure it out
237
- const s = await stat ( options . path [ 0 ] ) ;
238
- options . serverRoot = options . path [ 0 ] ;
239
- if ( s . isFile ( ) ) {
240
- const pathParts = options . path [ 0 ] . split ( path . sep ) ;
241
- options . path = [ path . sep + pathParts [ pathParts . length - 1 ] ] ;
242
- options . serverRoot =
243
- pathParts . slice ( 0 , pathParts . length - 1 ) . join ( path . sep ) || '.' ;
244
- } else {
245
- options . serverRoot = options . path [ 0 ] ;
246
- options . path = '/' ;
247
- }
248
- }
249
- }
250
- return options ;
251
- }
252
-
253
138
/**
254
139
* Crawl a given url with the provided options.
255
140
* @pram opts List of options used to do the crawl
@@ -260,7 +145,7 @@ export class LinkChecker extends EventEmitter {
260
145
// explicitly skip non-http[s] links before making the request
261
146
const proto = opts . url . protocol ;
262
147
if ( proto !== 'http:' && proto !== 'https:' ) {
263
- const r = {
148
+ const r : LinkResult = {
264
149
url : opts . url . href ,
265
150
status : 0 ,
266
151
state : LinkState . SKIPPED ,
@@ -306,6 +191,22 @@ export class LinkChecker extends EventEmitter {
306
191
}
307
192
}
308
193
194
+ // Check if this host has been marked for delay due to 429
195
+ if ( opts . delayCache . has ( opts . url . host ) ) {
196
+ const timeout = opts . delayCache . get ( opts . url . host ) ! ;
197
+ if ( timeout > Date . now ( ) ) {
198
+ opts . queue . add (
199
+ async ( ) => {
200
+ await this . crawl ( opts ) ;
201
+ } ,
202
+ {
203
+ delay : timeout - Date . now ( ) ,
204
+ }
205
+ ) ;
206
+ return ;
207
+ }
208
+ }
209
+
309
210
// Perform a HEAD or GET request based on the need to crawl
310
211
let status = 0 ;
311
212
let state = LinkState . BROKEN ;
@@ -322,6 +223,9 @@ export class LinkChecker extends EventEmitter {
322
223
validateStatus : ( ) => true ,
323
224
timeout : opts . checkOptions . timeout ,
324
225
} ) ;
226
+ if ( this . shouldRetryAfter ( res , opts ) ) {
227
+ return ;
228
+ }
325
229
326
230
// If we got an HTTP 405, the server may not like HEAD. GET instead!
327
231
if ( res . status === 405 ) {
@@ -333,6 +237,9 @@ export class LinkChecker extends EventEmitter {
333
237
validateStatus : ( ) => true ,
334
238
timeout : opts . checkOptions . timeout ,
335
239
} ) ;
240
+ if ( this . shouldRetryAfter ( res , opts ) ) {
241
+ return ;
242
+ }
336
243
}
337
244
} catch ( err ) {
338
245
// request failure: invalid domain name, etc.
@@ -355,6 +262,9 @@ export class LinkChecker extends EventEmitter {
355
262
headers,
356
263
timeout : opts . checkOptions . timeout ,
357
264
} ) ;
265
+ if ( this . shouldRetryAfter ( res , opts ) ) {
266
+ return ;
267
+ }
358
268
}
359
269
} catch ( ex ) {
360
270
failures . push ( ex ) ;
@@ -425,17 +335,74 @@ export class LinkChecker extends EventEmitter {
425
335
url : result . url ! ,
426
336
crawl,
427
337
cache : opts . cache ,
338
+ delayCache : opts . delayCache ,
428
339
results : opts . results ,
429
340
checkOptions : opts . checkOptions ,
430
341
queue : opts . queue ,
431
342
parent : opts . url . href ,
432
343
rootPath : opts . rootPath ,
344
+ retry : opts . retry ,
433
345
} ) ;
434
346
} ) ;
435
347
}
436
348
}
437
349
}
438
350
}
351
+ /**
352
+ * Check the incoming response for a `retry-after` header. If present,
353
+ * and if the status was an HTTP 429, calculate the date at which this
354
+ * request should be retried. Ensure the delayCache knows that we're
355
+ * going to wait on requests for this entire host.
356
+ * @param res GaxiosResponse returned from the request
357
+ * @param opts CrawlOptions used during this request
358
+ */
359
+ shouldRetryAfter ( res : GaxiosResponse , opts : CrawlOptions ) : boolean {
360
+ if ( ! opts . retry ) {
361
+ return false ;
362
+ }
363
+
364
+ const retryAfterRaw = res . headers [ 'retry-after' ] ;
365
+ if ( res . status !== 429 || ! retryAfterRaw ) {
366
+ return false ;
367
+ }
368
+
369
+ // The `retry-after` header can come in either <seconds> or
370
+ // A specific date to go check.
371
+ let retryAfter = Number ( retryAfterRaw ) * 1000 + Date . now ( ) ;
372
+ if ( isNaN ( retryAfter ) ) {
373
+ retryAfter = Date . parse ( retryAfterRaw ) ;
374
+ if ( isNaN ( retryAfter ) ) {
375
+ return false ;
376
+ }
377
+ }
378
+
379
+ // check to see if there is already a request to wait for this host
380
+ if ( opts . delayCache . has ( opts . url . host ) ) {
381
+ // use whichever time is higher in the cache
382
+ const currentTimeout = opts . delayCache . get ( opts . url . host ) ! ;
383
+ if ( retryAfter > currentTimeout ) {
384
+ opts . delayCache . set ( opts . url . host , retryAfter ) ;
385
+ }
386
+ } else {
387
+ opts . delayCache . set ( opts . url . host , retryAfter ) ;
388
+ }
389
+
390
+ opts . queue . add (
391
+ async ( ) => {
392
+ await this . crawl ( opts ) ;
393
+ } ,
394
+ {
395
+ delay : retryAfter - Date . now ( ) ,
396
+ }
397
+ ) ;
398
+ const retryDetails : RetryInfo = {
399
+ url : opts . url . href ,
400
+ status : res . status ,
401
+ secondsUntilRetry : Math . round ( ( retryAfter - Date . now ( ) ) / 1000 ) ,
402
+ } ;
403
+ this . emit ( 'retry' , retryDetails ) ;
404
+ return true ;
405
+ }
439
406
}
440
407
441
408
/**
0 commit comments