From 0d5239f79403dcc406e33e97268f0bec5fed7d70 Mon Sep 17 00:00:00 2001 From: harold Date: Sat, 15 Jan 2022 15:15:24 -0500 Subject: [PATCH] Byte count and item count limits --- index.ts | 1 + lib/sitemap-item-stream.ts | 472 ++++++++++++++++++----------------- lib/sitemap-stream.ts | 114 +++++++-- tests/sitemap-stream.test.ts | 135 +++++++++- 4 files changed, 475 insertions(+), 247 deletions(-) diff --git a/index.ts b/index.ts index 9346298a..5f1f14b9 100644 --- a/index.ts +++ b/index.ts @@ -6,6 +6,7 @@ export { SitemapItemStream, SitemapItemStreamOptions, + SitemapItemToXMLString, } from './lib/sitemap-item-stream'; export { IndexTagNames, diff --git a/lib/sitemap-item-stream.ts b/lib/sitemap-item-stream.ts index 92ec40a1..6fde80a5 100644 --- a/lib/sitemap-item-stream.ts +++ b/lib/sitemap-item-stream.ts @@ -30,287 +30,299 @@ function attrBuilder( }, iv); } -export interface SitemapItemStreamOptions extends TransformOptions { - level?: ErrorLevel; -} - /** - * Takes a stream of SitemapItemOptions and spits out xml for each - * @example - * // writes https://example.comhttps://example.com/2 - * const smis = new SitemapItemStream({level: 'warn'}) - * smis.pipe(writestream) - * smis.write({url: 'https://example.com', img: [], video: [], links: []}) - * smis.write({url: 'https://example.com/2', img: [], video: [], links: []}) - * smis.end() - * @param level - Error level + * Serializes a SitemapItem into an XML String + * @param item Item to serialize + * @returns string */ -export class SitemapItemStream extends Transform { - level: ErrorLevel; - constructor(opts: SitemapItemStreamOptions = { level: ErrorLevel.WARN }) { - opts.objectMode = true; - super(opts); - this.level = opts.level || ErrorLevel.WARN; +export function SitemapItemToXMLString(item: SitemapItem): string { + const tagParts: string[] = []; + + tagParts.push(otag(TagNames.url)); + tagParts.push(element(TagNames.loc, item.url)); + + if (item.lastmod) { + tagParts.push(element(TagNames.lastmod, 
item.lastmod)); } - _transform( - item: SitemapItem, - encoding: string, - callback: TransformCallback - ): void { - this.push(otag(TagNames.url)); - this.push(element(TagNames.loc, item.url)); + if (item.changefreq) { + tagParts.push(element(TagNames.changefreq, item.changefreq)); + } - if (item.lastmod) { - this.push(element(TagNames.lastmod, item.lastmod)); + if (item.priority !== undefined && item.priority !== null) { + if (item.fullPrecisionPriority) { + tagParts.push(element(TagNames.priority, item.priority.toString())); + } else { + tagParts.push(element(TagNames.priority, item.priority.toFixed(1))); } + } - if (item.changefreq) { - this.push(element(TagNames.changefreq, item.changefreq)); - } + item.video.forEach((video) => { + tagParts.push(otag(TagNames['video:video'])); - if (item.priority !== undefined && item.priority !== null) { - if (item.fullPrecisionPriority) { - this.push(element(TagNames.priority, item.priority.toString())); - } else { - this.push(element(TagNames.priority, item.priority.toFixed(1))); - } + tagParts.push( + element(TagNames['video:thumbnail_loc'], video.thumbnail_loc) + ); + tagParts.push(element(TagNames['video:title'], video.title)); + tagParts.push(element(TagNames['video:description'], video.description)); + + if (video.content_loc) { + tagParts.push(element(TagNames['video:content_loc'], video.content_loc)); } - item.video.forEach((video) => { - this.push(otag(TagNames['video:video'])); + if (video.player_loc) { + tagParts.push( + element( + TagNames['video:player_loc'], + attrBuilder(video, ['player_loc:autoplay', 'player_loc:allow_embed']), + video.player_loc + ) + ); + } - this.push(element(TagNames['video:thumbnail_loc'], video.thumbnail_loc)); - this.push(element(TagNames['video:title'], video.title)); - this.push(element(TagNames['video:description'], video.description)); + if (video.duration) { + tagParts.push( + element(TagNames['video:duration'], video.duration.toString()) + ); + } - if (video.content_loc) { - 
this.push(element(TagNames['video:content_loc'], video.content_loc)); - } + if (video.expiration_date) { + tagParts.push( + element(TagNames['video:expiration_date'], video.expiration_date) + ); + } - if (video.player_loc) { - this.push( - element( - TagNames['video:player_loc'], - attrBuilder(video, [ - 'player_loc:autoplay', - 'player_loc:allow_embed', - ]), - video.player_loc - ) - ); - } + if (video.rating !== undefined) { + tagParts.push(element(TagNames['video:rating'], video.rating.toString())); + } - if (video.duration) { - this.push( - element(TagNames['video:duration'], video.duration.toString()) - ); - } + if (video.view_count !== undefined) { + tagParts.push( + element(TagNames['video:view_count'], video.view_count.toString()) + ); + } - if (video.expiration_date) { - this.push( - element(TagNames['video:expiration_date'], video.expiration_date) - ); - } + if (video.publication_date) { + tagParts.push( + element(TagNames['video:publication_date'], video.publication_date) + ); + } - if (video.rating !== undefined) { - this.push(element(TagNames['video:rating'], video.rating.toString())); - } + for (const tag of video.tag) { + tagParts.push(element(TagNames['video:tag'], tag)); + } - if (video.view_count !== undefined) { - this.push( - element(TagNames['video:view_count'], video.view_count.toString()) - ); - } + if (video.category) { + tagParts.push(element(TagNames['video:category'], video.category)); + } - if (video.publication_date) { - this.push( - element(TagNames['video:publication_date'], video.publication_date) - ); - } + if (video.family_friendly) { + tagParts.push( + element(TagNames['video:family_friendly'], video.family_friendly) + ); + } - for (const tag of video.tag) { - this.push(element(TagNames['video:tag'], tag)); - } + if (video.restriction) { + tagParts.push( + element( + TagNames['video:restriction'], + attrBuilder(video, 'restriction:relationship'), + video.restriction + ) + ); + } - if (video.category) { - 
this.push(element(TagNames['video:category'], video.category)); - } + if (video.gallery_loc) { + tagParts.push( + element( + TagNames['video:gallery_loc'], + { title: video['gallery_loc:title'] }, + video.gallery_loc + ) + ); + } - if (video.family_friendly) { - this.push( - element(TagNames['video:family_friendly'], video.family_friendly) - ); - } + if (video.price) { + tagParts.push( + element( + TagNames['video:price'], + attrBuilder(video, [ + 'price:resolution', + 'price:currency', + 'price:type', + ]), + video.price + ) + ); + } - if (video.restriction) { - this.push( - element( - TagNames['video:restriction'], - attrBuilder(video, 'restriction:relationship'), - video.restriction - ) - ); - } + if (video.requires_subscription) { + tagParts.push( + element( + TagNames['video:requires_subscription'], + video.requires_subscription + ) + ); + } - if (video.gallery_loc) { - this.push( - element( - TagNames['video:gallery_loc'], - { title: video['gallery_loc:title'] }, - video.gallery_loc - ) - ); - } + if (video.uploader) { + tagParts.push( + element( + TagNames['video:uploader'], + attrBuilder(video, 'uploader:info'), + video.uploader + ) + ); + } - if (video.price) { - this.push( - element( - TagNames['video:price'], - attrBuilder(video, [ - 'price:resolution', - 'price:currency', - 'price:type', - ]), - video.price - ) - ); - } + if (video.platform) { + tagParts.push( + element( + TagNames['video:platform'], + attrBuilder(video, 'platform:relationship'), + video.platform + ) + ); + } - if (video.requires_subscription) { - this.push( - element( - TagNames['video:requires_subscription'], - video.requires_subscription - ) - ); - } + if (video.live) { + tagParts.push(element(TagNames['video:live'], video.live)); + } - if (video.uploader) { - this.push( - element( - TagNames['video:uploader'], - attrBuilder(video, 'uploader:info'), - video.uploader - ) - ); - } + if (video.id) { + tagParts.push(element(TagNames['video:id'], { type: 'url' }, video.id)); + } - if 
(video.platform) { - this.push( - element( - TagNames['video:platform'], - attrBuilder(video, 'platform:relationship'), - video.platform - ) - ); - } + tagParts.push(ctag(TagNames['video:video'])); + }); + + item.links.forEach((link) => { + tagParts.push( + element(TagNames['xhtml:link'], { + rel: 'alternate', + hreflang: link.lang || link.hreflang, + href: link.url, + }) + ); + }); + + if (item.expires) { + tagParts.push( + element(TagNames.expires, new Date(item.expires).toISOString()) + ); + } - if (video.live) { - this.push(element(TagNames['video:live'], video.live)); - } + if (item.androidLink) { + tagParts.push( + element(TagNames['xhtml:link'], { + rel: 'alternate', + href: item.androidLink, + }) + ); + } - if (video.id) { - this.push(element(TagNames['video:id'], { type: 'url' }, video.id)); - } + if (item.ampLink) { + tagParts.push( + element(TagNames['xhtml:link'], { + rel: 'amphtml', + href: item.ampLink, + }) + ); + } - this.push(ctag(TagNames['video:video'])); - }); + if (item.news) { + tagParts.push(otag(TagNames['news:news'])); + tagParts.push(otag(TagNames['news:publication'])); + tagParts.push(element(TagNames['news:name'], item.news.publication.name)); - item.links.forEach((link) => { - this.push( - element(TagNames['xhtml:link'], { - rel: 'alternate', - hreflang: link.lang || link.hreflang, - href: link.url, - }) - ); - }); + tagParts.push( + element(TagNames['news:language'], item.news.publication.language) + ); + tagParts.push(ctag(TagNames['news:publication'])); - if (item.expires) { - this.push( - element(TagNames.expires, new Date(item.expires).toISOString()) - ); + if (item.news.access) { + tagParts.push(element(TagNames['news:access'], item.news.access)); } - if (item.androidLink) { - this.push( - element(TagNames['xhtml:link'], { - rel: 'alternate', - href: item.androidLink, - }) - ); + if (item.news.genres) { + tagParts.push(element(TagNames['news:genres'], item.news.genres)); } - if (item.ampLink) { - this.push( - 
element(TagNames['xhtml:link'], { - rel: 'amphtml', - href: item.ampLink, - }) - ); - } + tagParts.push( + element(TagNames['news:publication_date'], item.news.publication_date) + ); + tagParts.push(element(TagNames['news:title'], item.news.title)); - if (item.news) { - this.push(otag(TagNames['news:news'])); - this.push(otag(TagNames['news:publication'])); - this.push(element(TagNames['news:name'], item.news.publication.name)); + if (item.news.keywords) { + tagParts.push(element(TagNames['news:keywords'], item.news.keywords)); + } - this.push( - element(TagNames['news:language'], item.news.publication.language) + if (item.news.stock_tickers) { + tagParts.push( + element(TagNames['news:stock_tickers'], item.news.stock_tickers) ); - this.push(ctag(TagNames['news:publication'])); - - if (item.news.access) { - this.push(element(TagNames['news:access'], item.news.access)); - } + } + tagParts.push(ctag(TagNames['news:news'])); + } - if (item.news.genres) { - this.push(element(TagNames['news:genres'], item.news.genres)); - } + // Image handling + item.img.forEach((image): void => { + tagParts.push(otag(TagNames['image:image'])); + tagParts.push(element(TagNames['image:loc'], image.url)); - this.push( - element(TagNames['news:publication_date'], item.news.publication_date) - ); - this.push(element(TagNames['news:title'], item.news.title)); + if (image.caption) { + tagParts.push(element(TagNames['image:caption'], image.caption)); + } - if (item.news.keywords) { - this.push(element(TagNames['news:keywords'], item.news.keywords)); - } + if (image.geoLocation) { + tagParts.push(element(TagNames['image:geo_location'], image.geoLocation)); + } - if (item.news.stock_tickers) { - this.push( - element(TagNames['news:stock_tickers'], item.news.stock_tickers) - ); - } - this.push(ctag(TagNames['news:news'])); + if (image.title) { + tagParts.push(element(TagNames['image:title'], image.title)); } - // Image handling - item.img.forEach((image): void => { - 
this.push(otag(TagNames['image:image'])); - this.push(element(TagNames['image:loc'], image.url)); + if (image.license) { + tagParts.push(element(TagNames['image:license'], image.license)); + } - if (image.caption) { - this.push(element(TagNames['image:caption'], image.caption)); - } + tagParts.push(ctag(TagNames['image:image'])); + }); - if (image.geoLocation) { - this.push(element(TagNames['image:geo_location'], image.geoLocation)); - } + tagParts.push(ctag(TagNames.url)); - if (image.title) { - this.push(element(TagNames['image:title'], image.title)); - } + return tagParts.join(''); +} - if (image.license) { - this.push(element(TagNames['image:license'], image.license)); - } +export interface SitemapItemStreamOptions extends TransformOptions { + level?: ErrorLevel; +} - this.push(ctag(TagNames['image:image'])); - }); +/** + * Takes a stream of SitemapItemOptions and spits out xml for each + * @example + * // writes https://example.comhttps://example.com/2 + * const smis = new SitemapItemStream({level: 'warn'}) + * smis.pipe(writestream) + * smis.write({url: 'https://example.com', img: [], video: [], links: []}) + * smis.write({url: 'https://example.com/2', img: [], video: [], links: []}) + * smis.end() + * @param level - Error level + */ +export class SitemapItemStream extends Transform { + level: ErrorLevel; + constructor(opts: SitemapItemStreamOptions = { level: ErrorLevel.WARN }) { + opts.objectMode = true; + super(opts); + this.level = opts.level || ErrorLevel.WARN; + } - this.push(ctag(TagNames.url)); + _transform( + item: SitemapItem, + encoding: string, + callback: TransformCallback + ): void { + this.push(SitemapItemToXMLString(item)); callback(); } } diff --git a/lib/sitemap-stream.ts b/lib/sitemap-stream.ts index 00e317b4..f014d40a 100644 --- a/lib/sitemap-stream.ts +++ b/lib/sitemap-stream.ts @@ -7,7 +7,7 @@ import { } from 'stream'; import { SitemapItemLoose, ErrorLevel, ErrorHandler } from './types'; import { validateSMIOptions, normalizeURL } from 
'./utils'; -import { SitemapItemStream } from './sitemap-item-stream'; +import { SitemapItemToXMLString } from './sitemap-item-stream'; import { EmptyStream, EmptySitemap } from './errors'; const xmlDec = ''; @@ -60,6 +60,26 @@ const getURLSetNs: (opts: NSArgs, xslURL?: string) => string = ( export const closetag = ''; export interface SitemapStreamOptions extends TransformOptions { + /** + * Byte limit to allow in the sitemap + * + * Sitemaps are supposed to be 50 MB or less in total size + * + * Writing fails with an error if the byte limit would be exceeded by the write + * + * @default unlimited + */ + byteLimit?: number; + /** + * Count of items to allow in the sitemap + * + * Sitemaps are supposed to have 50,000 or fewer items + * + * Writing fails with an error if the item count limit would be exceeded by the write + * + * @default unlimited + */ + countLimit?: number; hostname?: string; level?: ErrorLevel; lastmodDateOnly?: boolean; @@ -84,14 +104,19 @@ const defaultStreamOpts: SitemapStreamOptions = { * Sitemap. The readable stream it transforms **must** be in object mode. 
*/ export class SitemapStream extends Transform { - hostname?: string; - level: ErrorLevel; - hasHeadOutput: boolean; - xmlNS: NSArgs; - xslUrl?: string; - errorHandler?: ErrorHandler; - private smiStream: SitemapItemStream; - lastmodDateOnly: boolean; + private byteLimit?: number; + private countLimit?: number; + private hostname?: string; + private level: ErrorLevel; + private hasHeadOutput: boolean; + private xmlNS: NSArgs; + private xslUrl?: string; + private errorHandler?: ErrorHandler; + private lastmodDateOnly: boolean; + private _itemCount: number; + private _byteCount: number; + private _wroteCloseTag: boolean; + constructor(opts = defaultStreamOpts) { opts.objectMode = true; super(opts); @@ -99,11 +124,26 @@ export class SitemapStream extends Transform { this.hostname = opts.hostname; this.level = opts.level || ErrorLevel.WARN; this.errorHandler = opts.errorHandler; - this.smiStream = new SitemapItemStream({ level: opts.level }); - this.smiStream.on('data', (data) => this.push(data)); this.lastmodDateOnly = opts.lastmodDateOnly || false; this.xmlNS = opts.xmlns || defaultXMLNS; this.xslUrl = opts.xslUrl; + this.byteLimit = opts.byteLimit; + this.countLimit = opts.countLimit; + this._byteCount = 0; + this._itemCount = 0; + this._wroteCloseTag = false; + } + + public get byteCount(): number { + return this._byteCount; + } + + public get itemCount(): number { + return this._itemCount; + } + + public get wroteCloseTag(): boolean { + return this._wroteCloseTag; } _transform( @@ -112,23 +152,67 @@ export class SitemapStream extends Transform { callback: TransformCallback ): void { if (!this.hasHeadOutput) { + // Add the opening tag size and closing tag size in UTF-8 bytes (since we have to close) this.hasHeadOutput = true; - this.push(getURLSetNs(this.xmlNS, this.xslUrl)); + const headOutput = getURLSetNs(this.xmlNS, this.xslUrl); + this._byteCount += Buffer.byteLength(headOutput) + closetag.length; + this.push(headOutput); + } + + // Reject if item limit would be exceeded + if 
(this.countLimit !== undefined && this._itemCount === this.countLimit) { + // Write the close tag as the stream will be ended by raising an error + this.push(closetag); + this._wroteCloseTag = true; + + callback( + new Error( + 'Item count limit would be exceeded, not writing, stream will close' + ) + ); + return; } - this.smiStream.write( + + const itemOutput = SitemapItemToXMLString( validateSMIOptions( normalizeURL(item, this.hostname, this.lastmodDateOnly), this.level, this.errorHandler ) ); + + // Check if the size in UTF-8 bytes would be exceeded by the new item + // and throw if it would exceed (when size limit enabled) + if (this.byteLimit !== undefined) { + if (this._byteCount + Buffer.byteLength(itemOutput) > this.byteLimit) { + // Write the close tag as the stream will be ended by raising an error + this.push(closetag); + this._wroteCloseTag = true; + + callback( + new Error( + 'Byte count limit would be exceeded, not writing, stream will close' + ) + ); + return; + } + } + + this.push(itemOutput); + this._byteCount += Buffer.byteLength(itemOutput); + this._itemCount += 1; + callback(); } _flush(cb: TransformCallback): void { - if (!this.hasHeadOutput) { + if (this._wroteCloseTag) { + cb(); + } else if (!this.hasHeadOutput) { + this._wroteCloseTag = true; cb(new EmptySitemap()); } else { + this._wroteCloseTag = true; this.push(closetag); cb(); } @@ -151,7 +235,7 @@ export function streamToPromise(stream: Readable): Promise { }, }) ) - .on('error', reject) + .on('error', (err) => reject(err)) .on('finish', () => { if (!drain.length) { reject(new EmptyStream()); diff --git a/tests/sitemap-stream.test.ts b/tests/sitemap-stream.test.ts index 2de9e062..b78be0e3 100644 --- a/tests/sitemap-stream.test.ts +++ b/tests/sitemap-stream.test.ts @@ -1,9 +1,14 @@ +import { promisify } from 'util'; +import { finished, pipeline, Writable } from 'stream'; + import { SitemapStream, closetag, streamToPromise, } from '../lib/sitemap-stream'; +const finishedAsync = promisify(finished); + const minimumns = ' {
sms.write(sampleURLs[0]); sms.write(sampleURLs[1]); sms.end(); - expect((await streamToPromise(sms)).toString()).toBe( + expect(sms.itemCount).toBe(2); + const outputStr = (await streamToPromise(sms)).toString(); + expect(sms.byteCount).toBe(outputStr.length); + expect(outputStr).toBe( preamble + `${sampleURLs[0]}/` + `${sampleURLs[1]}` + @@ -27,6 +35,129 @@ describe('sitemap stream', () => { ); }); + it('emits error on item count would be exceeded', async () => { + const sms = new SitemapStream({ countLimit: 1 }); + const drain = []; + const sink = new Writable({ + write(chunk, enc, next): void { + drain.push(chunk); + next(); + }, + }); + + const pipelineCallback = jest.fn(); + + pipeline(sms, sink, pipelineCallback); + + const writeAsync = ( + chunk: any, + encoding?: BufferEncoding + ): Promise => { + return new Promise((resolve, reject) => { + const writeReturned = sms.write(chunk, encoding, (error) => { + if (error !== undefined) { + reject(error); + } else { + resolve(writeReturned); + } + }); + }); + }; + + // This write will succeed + await writeAsync(sampleURLs[0]); + expect(sms.itemCount).toBe(1); + expect(sms.wroteCloseTag).toBe(false); + + // This write will fail + await expect(() => writeAsync(sampleURLs[1])).rejects.toThrow( + 'Item count limit would be exceeded, not writing, stream will close' + ); + + // Node 12 hangs on this await, Node 14 fixes it + if (process.version.split('.')[0] !== 'v12') { + // This is the signal that the file was closed correctly + expect(sms.wroteCloseTag).toBe(true); + expect(sms.destroyed).toBe(true); + + await finishedAsync(sms); + } + + // Note: we cannot use streamToPromise here because + // it just hangs in this case - That's probably a problem to fix. 
+ // const outputStr = (await streamToPromise(sms)).toString(); + + // Closing should generate a valid file with contents + // from before the exception + const outputStr = Buffer.concat(drain).toString(); + + expect(outputStr).toBe( + preamble + `${sampleURLs[0]}/` + closetag + ); + expect(sms.byteCount).toBe(outputStr.length); + + expect(pipelineCallback).toBeCalledTimes(1); + }); + + it('throws on byte count would be exceeded', async () => { + const drain = []; + const sink = new Writable({ + write(chunk, enc, next): void { + drain.push(chunk); + next(); + }, + }); + + const pipelineCallback = jest.fn(); + + const sms = new SitemapStream({ byteLimit: 400 }); + + pipeline(sms, sink, pipelineCallback); + + const writeAsync = ( + chunk: any, + encoding?: BufferEncoding + ): Promise => { + return new Promise((resolve, reject) => { + const writeReturned = sms.write(chunk, encoding, (error) => { + if (error !== undefined) { + reject(error); + } else { + resolve(writeReturned); + } + }); + }); + }; + + // This write will succeed + await writeAsync(sampleURLs[0]); + expect(sms.itemCount).toBe(1); + expect(sms.byteCount).toBe(375); + expect(sms.wroteCloseTag).toBe(false); + + await expect(() => writeAsync(sampleURLs[1])).rejects.toThrow( + 'Byte count limit would be exceeded, not writing, stream will close' + ); + + expect(sms.wroteCloseTag).toBe(true); + expect(sms.destroyed).toBe(true); + + // Node 12 hangs on this await, Node 14 fixes it + if (process.version.split('.')[0] !== 'v12') { + await finishedAsync(sms); + } + + // Closing should generate a valid file after the exception + const outputStr = Buffer.concat(drain).toString(); + + expect(sms.byteCount).toBe(outputStr.length); + expect(outputStr).toBe( + preamble + `${sampleURLs[0]}/` + closetag + ); + + expect(pipelineCallback).toBeCalledTimes(1); + }); + it('pops out custom xmlns', async () => { const sms = new SitemapStream({ xmlns: { @@ -43,7 +174,7 @@ describe('sitemap stream', () => { 
sms.write(sampleURLs[0]); sms.write(sampleURLs[1]); sms.end(); - expect((await streamToPromise(sms)).toString()).toBe( + await expect((await streamToPromise(sms)).toString()).toBe( minimumns + xhtml + image +