Skip to content

Commit 6e33b39

Browse files
authored Oct 5, 2021
fix(perf): use request streams to reduce memory usage (#336)
1 parent ce88410 commit 6e33b39

File tree

2 files changed

+23
-17
lines changed

2 files changed

+23
-17
lines changed
 

‎src/index.ts

+12-11
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import {EventEmitter} from 'events';
22
import {URL} from 'url';
33
import * as http from 'http';
44
import * as path from 'path';
5+
import {Readable} from 'stream';
56

67
import {request, GaxiosResponse} from 'gaxios';
78

@@ -222,16 +223,15 @@ export class LinkChecker extends EventEmitter {
222223
// Perform a HEAD or GET request based on the need to crawl
223224
let status = 0;
224225
let state = LinkState.BROKEN;
225-
let data = '';
226226
let shouldRecurse = false;
227-
let res: GaxiosResponse<string> | undefined = undefined;
227+
let res: GaxiosResponse<Readable> | undefined = undefined;
228228
const failures: {}[] = [];
229229
try {
230-
res = await request<string>({
230+
res = await request<Readable>({
231231
method: opts.crawl ? 'GET' : 'HEAD',
232232
url: opts.url.href,
233233
headers,
234-
responseType: opts.crawl ? 'text' : 'stream',
234+
responseType: 'stream',
235235
validateStatus: () => true,
236236
timeout: opts.checkOptions.timeout,
237237
});
@@ -241,7 +241,7 @@ export class LinkChecker extends EventEmitter {
241241

242242
// If we got an HTTP 405, the server may not like HEAD. GET instead!
243243
if (res.status === 405) {
244-
res = await request<string>({
244+
res = await request<Readable>({
245245
method: 'GET',
246246
url: opts.url.href,
247247
headers,
@@ -257,7 +257,7 @@ export class LinkChecker extends EventEmitter {
257257
// request failure: invalid domain name, etc.
258258
// this also occasionally catches too many redirects, but is still valid (e.g. https://www.ebay.com)
259259
// for this reason, we also try doing a GET below to see if the link is valid
260-
failures.push(err);
260+
failures.push(err as Error);
261261
}
262262

263263
try {
@@ -266,10 +266,10 @@ export class LinkChecker extends EventEmitter {
266266
(res === undefined || res.status < 200 || res.status >= 300) &&
267267
!opts.crawl
268268
) {
269-
res = await request<string>({
269+
res = await request<Readable>({
270270
method: 'GET',
271271
url: opts.url.href,
272-
responseType: 'text',
272+
responseType: 'stream',
273273
validateStatus: () => true,
274274
headers,
275275
timeout: opts.checkOptions.timeout,
@@ -279,13 +279,12 @@ export class LinkChecker extends EventEmitter {
279279
}
280280
}
281281
} catch (ex) {
282-
failures.push(ex);
282+
failures.push(ex as Error);
283283
// catch the next failure
284284
}
285285

286286
if (res !== undefined) {
287287
status = res.status;
288-
data = res.data;
289288
shouldRecurse = isHtml(res);
290289
}
291290

@@ -309,7 +308,9 @@ export class LinkChecker extends EventEmitter {
309308
// If we need to go deeper, scan the next level of depth for links and crawl
310309
if (opts.crawl && shouldRecurse) {
311310
this.emit('pagestart', opts.url);
312-
const urlResults = getLinks(data, opts.url.href);
311+
const urlResults = res?.data
312+
? await getLinks(res.data, opts.url.href)
313+
: [];
313314
for (const result of urlResults) {
314315
// if there was some sort of problem parsing the link while
315316
// creating a new URL obj, treat it as a broken link.

‎src/links.ts

+11-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
import * as htmlParser from 'htmlparser2';
1+
import * as htmlParser from 'htmlparser2/lib/WritableStream';
2+
import {Readable} from 'stream';
23
import {URL} from 'url';
34

45
const linksAttr = {
@@ -42,11 +43,14 @@ export interface ParsedUrl {
4243
url?: URL;
4344
}
4445

45-
export function getLinks(source: string, baseUrl: string): ParsedUrl[] {
46+
export async function getLinks(
47+
source: Readable,
48+
baseUrl: string
49+
): Promise<ParsedUrl[]> {
4650
let realBaseUrl = baseUrl;
4751
let baseSet = false;
4852
const links = new Array<ParsedUrl>();
49-
const parser = new htmlParser.Parser({
53+
const parser = new htmlParser.WritableStream({
5054
onopentag(tag: string, attributes: {[s: string]: string}) {
5155
// Allow alternate base URL to be specified in tag:
5256
if (tag === 'base' && !baseSet) {
@@ -79,8 +83,9 @@ export function getLinks(source: string, baseUrl: string): ParsedUrl[] {
7983
}
8084
},
8185
});
82-
parser.write(source);
83-
parser.end();
86+
await new Promise((resolve, reject) => {
87+
source.pipe(parser).on('finish', resolve).on('error', reject);
88+
});
8489
return links;
8590
}
8691

@@ -110,6 +115,6 @@ function parseLink(link: string, baseUrl: string): ParsedUrl {
110115
url.hash = '';
111116
return {link, url};
112117
} catch (error) {
113-
return {link, error};
118+
return {link, error: error as Error};
114119
}
115120
}

0 commit comments

Comments (0)
Please sign in to comment.