-
Notifications
You must be signed in to change notification settings - Fork 18
/
mimeparser.js
1060 lines (989 loc) · 44.5 KB
/
mimeparser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
////////////////////////////////////////////////////////////////////////////////
// JavaScript Raw MIME Parser //
////////////////////////////////////////////////////////////////////////////////
/**
* The parser implemented in this file produces a MIME part tree for a given
* input message via a streaming callback interface. It does not, by itself,
* understand concepts like attachments (hence the term 'Raw'); the consumer
* must translate output into such a format.
*
* Charsets:
* The MIME specifications permit a single message to contain multiple charsets
* (or perhaps none) as raw octets. As JavaScript strings are implicitly
* implemented in UTF-16, it is possible that some engines will attempt to
* convert these strings using an incorrect charset or simply fail to convert
* them at all. This parser assumes that its input is in the form of a "binary
* string", a string that uses only the first 256 characters of Unicode to
* represent the individual octets. To verify that charsets are not getting
* mangled elsewhere in the pipeline, the auxiliary test file test/data/charsets
* can be used.
*
* This parser attempts to hide the charset details from clients as much as
* possible. The resulting values of structured headers are always converted
* into proper Unicode strings before being exposed to clients; getting at the
* raw binary string data can only be done via getRawHeader. The .charset
* parameter on header objects, if changed, changes the fallback charset used
* for headers. It is initialized to the presumed charset of the corresponding
* part, taking into account the charset and force-charset options of the
* parser. Body parts are only converted into Unicode strings if the strformat
* option is set to Unicode. Even then, only the bodies of parts with a media
* type of text are converted to Unicode strings using available charset data;
* other parts are retained as Uint8Array objects.
*
* Part numbering:
* Since the output is a streaming format, individual parts are identified by a
* numbering scheme. The intent of the numbering scheme for parts is to comply
* with the part numbers as dictated by RFC 3501 as much as possible; however,
* that scheme does have several edge cases which would, if strictly followed,
* make it impossible to refer to certain parts of the message. In addition, we
* wish to make it possible to refer to parts which are not discoverable in the
* original MIME tree but are still viewable as parts. The part numbering
* scheme is as follows:
* - Individual sections of a multipart/* body are numbered in increasing order
* sequentially, starting from 1. Note that the prologue and the epilogue of
* a multipart/* body are not considered entities and are therefore not
* included in the part numbering scheme (there is no way to refer to them).
* - The numbers of multipart/* parts are separated by `.' characters.
* - The outermost message is referred to by use of the empty string.
* --> The following segments are not accounted for by IMAP part numbering. <--
* - The body of any message/rfc822 or similar part is distinguished from the
* message part as a whole by appending a `$' character. This does not apply
* to the outermost message/rfc822 envelope.
*/
define(function(require) {
"use strict";
var mimeutils = require('./mimeutils');
var headerparser = require('./headerparser');
var spellings = require('./structuredHeaders').spellings;
/**
* An object that represents the structured MIME headers for a message.
*
* This class is primarily used as the 'headers' parameter in the startPart
* callback on handlers for MimeParser. As such, it is designed to do the right
* thing in common cases as much as possible, with some advanced customization
* possible for clients that need such flexibility.
*
* In a nutshell, this class stores the raw headers as an internal Map. The
* structured headers are not computed until they are actually used, which means
* that potentially expensive structuring (e.g., doing manual DKIM validation)
* can be performed as a structured decoder without impeding performance for
* those who just want a few common headers.
*
* The outer API of this class is intended to be similar to a read-only Map
* object (complete with iterability support), with a few extra properties to
* represent things that are hard to determine properly from headers. The keys
* used are "preferred spellings" of the headers, although the get and has
* methods will accept header parameters of any case. Preferred spellings are
* derived from the name passed to addStructuredDecoder/addStructuredEncoder; if
* no structured decoder has been registered, then the name capitalizes the
* first letter of every word in the header name.
*
* Extra properties compared to a Map object are:
* - charset: This field represents the assumed charset of the associated MIME
* body. It is prefilled using a combination of the charset and force-charset
* options on the associated MimeParser instance as well as attempting to find
* a charset parameter in the Content-Type header.
*
* If the force-charset option is false, the charset is guessed first using
* the Content-Type header's charset parameter, falling back to the charset
* option if it is present. If the force-charset option is true, the charset
* is initially set to the charset option. This initial guessed value can be
* overridden at any time by simply setting the field on this object.
*
* The charset is better reflected as a parameter of the body rather than the
* headers; this is ultimately the charset parameter that will be used if a
* body part is being converted to a Unicode strformat. Headers are converted
* using headerparser.convert8BitHeader, and this field is used as the
* fallbackCharset parameter, which will always attempt to decode as UTF-8
* first (in accordance with RFC 6532) and will refuse to decode as UTF-16 or
* UTF-32, as ASCII is not a subset of those charsets.
*
* - rawHeaderText: This read-only field contains the original header text from
* which headers were parsed, preserving case and whitespace (including
* alternate line endings instead of CRLF) exactly. If the header text begins
* with the mbox delimiter (i.e., a line that begins with "From "), then that
* is excluded from the rawHeaderText value and is not reflected anywhere in
* this object.
*
* - contentType: This field contains the structured representation of the
* Content-Type header, if it is present. If it is not present, it is set to
* the structured representation of the default Content-Type for a part (as
* this data is not easily guessed given only MIME tree events).
*
* The constructor for these objects is not externally exported, and thus they
* can only be created via MimeParser.
*
* @param rawHeaderText {BinaryString} The contents of the MIME headers to be
* parsed.
* @param options {Object} Options for the header parser.
* @param options.stripcontinuations {Boolean} If true, elide CRLFs from the
* raw header output.
*/
function StructuredHeaders(rawHeaderText, options) {
  // An individual header is terminated by a line ending (CRLF, LF, or a lone
  // CR), except when that line ending is followed by a SP or TAB, which marks
  // a folded continuation line. The negative lookaheads keep folded lines
  // together, and the group is non-capturing so that split() does not emit the
  // separators themselves.
  let values = rawHeaderText.split(/(?:\r\n|\n)(?![ \t])|\r(?![ \t\n])/);

  // Ignore the first "header" if it begins with an mbox delimiter.
  if (values.length > 0 && values[0].substring(0, 5) === "From ") {
    // Elide the delimiter from rawHeaderText by offset: drop the delimiter line
    // plus the single line ending that follows it. Searching for the text of
    // the next header instead goes wrong when that text also occurs inside the
    // delimiter line, or when the next value is the empty string.
    let delimiter = values.shift();
    rawHeaderText = rawHeaderText.substring(delimiter.length)
                                 .replace(/^(?:\r\n|\n|\r)/, '');
  }

  let headers = new Map();
  for (let line of values) {
    // Look for a colon. If it's not present, this header line is malformed,
    // perhaps by premature EOF or similar; treat the whole line as a name with
    // an empty value.
    let colon = line.indexOf(":");
    let header, val;
    if (colon >= 0) {
      header = line.substring(0, colon);
      val = line.substring(colon + 1).trim();
      if (options.stripcontinuations)
        val = val.replace(/[\r\n]/g, '');
    } else {
      header = line;
      val = '';
    }

    // Canonicalize the header name in lower-case form.
    header = header.trim().toLowerCase();
    // Omit "empty" headers.
    if (header === '')
      continue;

    // We keep an array of values for each header, since a given header may be
    // repeated multiple times.
    if (headers.has(header)) {
      headers.get(header).push(val);
    } else {
      headers.set(header, [val]);
    }
  }

  /**
   * A map of lower-cased header names to arrays of raw values found in this
   * header block.
   * @private
   */
  this._rawHeaders = headers;
  /**
   * Cached results of structured header parsing.
   * @private
   */
  this._cachedHeaders = new Map();

  // Read-only view of the header text (minus any mbox delimiter line).
  Object.defineProperty(this, "rawHeaderText",
    {get: function () { return rawHeaderText; }});
  // Number of distinct header names, mirroring Map.size.
  Object.defineProperty(this, "size",
    {get: function () { return this._rawHeaders.size; }});
  Object.defineProperty(this, "charset", {
    get: function () { return this._charset; },
    set: function (value) {
      this._charset = value;
      // Clear the cached headers, since this could change their values
      this._cachedHeaders.clear();
    }
  });

  // Default to the charset option, until the message parser overrides us.
  if ('charset' in options)
    this._charset = options.charset;
  else
    this._charset = null;

  // If we have a Content-Type header, set contentType to return the structured
  // representation. We don't set the value off the bat, since we want to let
  // someone who changes the charset affect the values of 8-bit parameters.
  Object.defineProperty(this, "contentType", {
    configurable: true,
    get: function () { return this.get('Content-Type'); }
  });
}
/**
 * Look up the unprocessed values of a header.
 *
 * The result is the array of values recorded for the header, in the order in
 * which they appeared in the header block. No charset conversion and no
 * RFC 2047 decoding is applied. The lookup is case-insensitive. Given this
 * header block (where <XX> stands for a binary octet):
 *
 * X-Header: Value A
 * X-Header: V<C3><A5>lue B
 * Header2: Q
 *
 * getRawHeader('X-Header') and getRawHeader('x-header') both produce
 * ['Value A', 'V\xC3\xA5lue B'], and getRawHeader('Header2') produces ['Q'].
 *
 * @param headerName {String} The name of the header to look up.
 * @returns {BinaryString[]} The raw values, or undefined if the header is not
 *                           present in this header block.
 */
StructuredHeaders.prototype.getRawHeader = function (headerName) {
  // Raw headers are keyed by their lower-case name.
  let canonicalName = headerName.toLowerCase();
  return this._rawHeaders.get(canonicalName);
};
/**
 * Retrieve a structured version of the header.
 *
 * Each raw value is first converted to Unicode with
 * headerparser.convert8BitHeader, using this object's charset as the fallback
 * charset. The converted values are then handed to
 * headerparser.parseStructuredHeader. If that call throws, the header is
 * treated as unstructured instead: every value is passed through
 * headerparser.decodeRFC2047Words and the resulting array is returned.
 *
 * A substantial set of headers have pre-registered structured decoders, which,
 * in some cases, are unable to be overridden due to their importance in the
 * functioning of the parser code itself.
 *
 * Results are cached per header name; the cache is emptied whenever the
 * charset property is assigned.
 *
 * @param headerName {String} The header name for which to get the header value
 *                            (any case).
 * @returns The structured header value, or undefined if the header is absent.
 */
StructuredHeaders.prototype.get = function (headerName) {
  // All internal maps are keyed by the lower-case header name.
  let key = headerName.toLowerCase();

  // Serve repeated lookups from the cache.
  let cache = this._cachedHeaders;
  if (cache.has(key))
    return cache.get(key);

  // An absent header is reported as undefined and is deliberately not cached.
  let rawValues = this._rawHeaders.get(key);
  if (rawValues === undefined)
    return undefined;

  // Convert every raw value to Unicode.
  let fallbackCharset = this.charset;
  let unicodeValues = rawValues.map(
    (value) => headerparser.convert8BitHeader(value, fallbackCharset));

  // Prefer the structured decoder; if it throws, fall back to treating the
  // header as unstructured and only apply RFC 2047 decoding.
  let structured;
  try {
    structured = headerparser.parseStructuredHeader(key, unicodeValues);
  } catch (e) {
    structured = unicodeValues.map(
      (value) => headerparser.decodeRFC2047Words(value));
  }

  cache.set(key, structured);
  return structured;
};
/**
 * Check if the message has the given header.
 *
 * @param headerName {String} The header name to look for (any case).
 * @returns {Boolean} True if the header is present in this header block.
 */
StructuredHeaders.prototype.has = function (headerName) {
  // Presence is decided by the raw headers; the cache only holds headers that
  // have already been requested.
  let canonicalName = headerName.toLowerCase();
  return this._rawHeaders.has(canonicalName);
};
// Property key under which the default iterator of StructuredHeaders is
// installed. When the engine provides Symbol, this is the standard
// Symbol.iterator; otherwise the string "@@iterator" is used as a stand-in
// property name (presumably the name honored by pre-Symbol engines -- TODO
// confirm).
const JS_HAS_SYMBOLS = typeof Symbol === "function";
const ITERATOR_SYMBOL = JS_HAS_SYMBOLS ? Symbol.iterator : "@@iterator";
/**
 * An equivalent of Map.@@iterator, applied to the structured header
 * representations. This is the function that makes
 * for (let [header, value] of headers) work properly.
 *
 * Each step yields a two-element array: the header name as produced by keys(),
 * followed by the result of get() for that name.
 */
StructuredHeaders.prototype[ITERATOR_SYMBOL] = function*() {
  let names = this.keys();
  for (let name of names) {
    // get() is case-insensitive, so the name from keys() can be used as-is.
    let entry = [name, this.get(name)];
    yield entry;
  }
};
/**
 * An equivalent of Map.forEach, applied to the structured header
 * representations.
 *
 * @param callback {Function(value, name, headers)} Invoked once per header, in
 *                                                  iteration order.
 * @param thisarg {Object} The value to use as |this| inside the callback.
 */
StructuredHeaders.prototype.forEach = function (callback, thisarg) {
  for (let pair of this) {
    let [name, value] = pair;
    // Argument order follows Map.forEach: value first, then key, then the
    // collection being iterated.
    callback.call(thisarg, value, name, this);
  }
};
/**
 * An equivalent of Map.entries, applied to the structured header
 * representations.
 *
 * This is an alias: it refers to the very same function object that is stored
 * under ITERATOR_SYMBOL on the prototype.
 */
StructuredHeaders.prototype.entries =
    StructuredHeaders.prototype[ITERATOR_SYMBOL];
/// Turn a lower-case header name into a pseudo-preferred spelling by
/// upper-casing every lower-case ASCII letter that sits at a word boundary
/// (for example, "x-header" becomes "X-Header").
function capitalize(headerName) {
  const upperCaseFirstLetter = (letter) => letter.toUpperCase();
  return headerName.replace(/\b[a-z]/g, upperCaseFirstLetter);
}
/**
 * An equivalent of Map.keys, applied to the structured header representations.
 *
 * Names are yielded in the order of the raw header map. Each lower-case name
 * is replaced by its entry in the spellings table when there is one, and is
 * otherwise run through capitalize().
 */
StructuredHeaders.prototype.keys = function*() {
  for (let lowerName of this._rawHeaders.keys()) {
    let preferred = spellings.get(lowerName);
    yield preferred || capitalize(lowerName);
  }
};
/**
 * An equivalent of Map.values, applied to the structured header
 * representations.
 *
 * Walks this object's default iterator and yields only the value half of each
 * [name, value] pair.
 */
StructuredHeaders.prototype.values = function* () {
  for (let pair of this) {
    let [, structuredValue] = pair;
    yield structuredValue;
  }
};
/**
* A MIME parser.
*
* The inputs to the constructor consist of a callback object which receives
* information about the output data and an optional object containing the
* settings for the parser.
*
* The first parameter, emitter, is an object which contains several callbacks.
* Note that any and all of these methods are optional; the parser will not
* crash if one is missing. The callbacks are as follows:
* startMessage()
* Called when the stream to be parsed has started delivering data. This
* will be called exactly once, before any other call.
* endMessage()
* Called after all data has been delivered and the message parsing has
* been completed. This will be called exactly once, after any other call.
* startPart(string partNum, object headers)
* Called after the headers for a body part (including the top-level
* message) have been parsed. The first parameter is the part number (see
* the discussion on part numbering). The second parameter is an instance
* of StructuredHeaders that represents all of the headers for the part.
* endPart(string partNum)
* Called after all of the data for a body part (including sub-parts) has
* been parsed. The first parameter is the part number.
* deliverPartData(string partNum, {string,typedarray} data)
* Called when some data for a body part has been delivered. The first
* parameter is the part number. The second parameter is the data which is
* being delivered; the exact type of this data depends on the options
* used. Note that data is only delivered for leaf body parts.
*
* The second parameter, options, is an optional object containing the options
* for the parser. The following are the options that the parser may use:
* pruneat: <string> [default=""]
* Treat the message as starting at the given part number, so that no parts
* above <string> are returned.
* bodyformat: one of {none, raw, nodecode, decode} [default=nodecode]
* How to return the bodies of parts:
* none: no part data is returned
* raw: the body of the part is passed through raw
* nodecode: the body is passed through without decoding QP/Base64
* decode: quoted-printable and base64 are fully decoded
* strformat: one of {binarystring, unicode, typedarray} [default=binarystring]
* How to treat output strings:
* binarystring: Data is a JS string with chars in the range [\x00-\xff]
* unicode: Data for text parts is converted to UTF-16; data for other
* parts is a typed array buffer, akin to typedarray.
* typedarray: Data is a JS typed array buffer
* charset: <string> [default=""]
* What charset to assume if no charset information is explicitly provided.
* This only matters if strformat is unicode. See above note on charsets
* for more details.
* force-charset: <boolean> [default=false]
* If true, this coerces all types to use the charset option, even if the
* message specifies a different content-type.
* stripcontinuations: <boolean> [default=true]
* If true, then the newlines in headers are removed in the returned
* header objects.
* onerror: <function(thrown error)> [default = nop-function]
* An error function that is called if an emitter callback throws an error.
* By default, such errors are swallowed by the parser. If you want the
* parser itself to throw an error, rethrow it via the onerror function.
*/
function MimeParser(emitter, options) {
  /// The actual emitter
  this._emitter = emitter;

  /// Options for the parser (those listed here are defaults)
  this._options = {
    pruneat: "",
    bodyformat: "nodecode",
    strformat: "binarystring",
    stripcontinuations: true,
    charset: "",
    "force-charset": false,
    onerror: function swallow(error) {}
  };

  // Load the options as a copy here (prevents people from changing on the fly).
  if (options) {
    for (let opt in options) {
      this._options[opt] = options[opt];
    }
  }

  // Ensure that the error function is in fact a function. _callEmitter invokes
  // it unconditionally, so reject anything else up front. (There is no global
  // named Exception in JavaScript; throwing one would surface as an unrelated
  // ReferenceError instead of this message.)
  if (typeof this._options.onerror != "function")
    throw new TypeError("onerror callback must be a function");

  // Reset the parser
  this.resetParser();
}
/**
 * Resets the parser to read a new message. The constructor already calls this,
 * so it need not be called immediately after construction.
 */
MimeParser.prototype.resetParser = function () {
  /// Current parser state; every message starts out by reading headers.
  this._state = PARSING_HEADERS;

  /// Input data that needs to be held for buffer conditioning
  this._holdData = '';

  /// Complete collection of headers (also used to accumulate _headerData)
  this._headerData = '';

  /// Whether or not emitter.startMessage has been called
  this._triggeredCall = false;

  /// Splitting input: the callback and the regex that triggers it
  this._handleSplit = undefined;
  this._splitRegex = undefined;

  /// Subparsing: the part number passed to the subparser, and the subparser
  this._subPartNum = undefined;
  this._subparser = undefined;

  /// Data that has yet to be consumed by _convertData
  this._savedBuffer = '';

  /// Convert data
  this._convertData = undefined;

  /// String decoder
  this._decoder = undefined;
};
/**
 * Deliver a buffer of data to the parser.
 *
 * @param buffer {BinaryString} The raw data to add to the message.
 */
MimeParser.prototype.deliverData = function (buffer) {
  // Ideally the whole message would be parsed in one go, but in practice data
  // arrives in packets. To keep the amount of saved state low, the internal
  // parser is only ever handed packets with some basic guarantees. The model
  // is a twist on line-buffering (several lines may be handled at once), which
  // is achieved by withholding part of the input:
  // 1. A trailing `\r' may be half of `\r\n' or an old Mac line ending; that
  //    cannot be known until the next character shows up, so the last `\r' is
  //    withheld.
  // 2. Every packet must end on a newline, so everything after the last line
  //    ending is withheld until its own line ending comes through.
  // [This means that a message using bare \r line endings that is fed to us
  // through a line-buffered input will have most of its data withheld until
  // the next buffer. Since \r is so uncommon a line ending in modern times,
  // this is acceptable lossage.]
  // 3. Empty packets are dropped.

  // Start from whatever was withheld last time.
  let pending = buffer;
  if (this._holdData) {
    pending = this._holdData + pending;
    this._holdData = '';
  }

  // Condition the input so that it ends on a line ending; the remainder is
  // kept back for the next call.
  if (pending.length > 0) {
    let conditioned = conditionToEndOnCRLF(pending);
    pending = conditioned[0];
    this._holdData = conditioned[1];
  }

  // Ignore 0-length buffers.
  if (pending.length == 0)
    return;

  // Signal the beginning, if we haven't done so.
  if (!this._triggeredCall) {
    this._callEmitter("startMessage");
    this._triggeredCall = true;
  }

  // Finally, send it to the internal parser.
  this._dispatchData("", pending, true);
};
/**
 * Ensure that a set of data always ends in an end-of-line character.
 *
 * The split point is the last '\n' in the buffer or the last '\r' that is not
 * the final character, whichever comes later. A '\r' in the final position is
 * never used as the split point, because only the next packet can tell
 * whether it starts a CRLF pair or is a line ending by itself.
 *
 * @param buffer {BinaryString} The data with no guarantees about where it ends.
 * @returns {BinaryString[]} An array of 2 binary strings where the first string
 *                           is either empty or ends in a newline, and the last
 *                           string contains the text in buffer following the
 *                           first string.
 */
function conditionToEndOnCRLF(buffer) {
  // Search for '\r' only up to the second-to-last character. lastIndexOf
  // treats a negative start position as 0, so a buffer shorter than two
  // characters has to be excluded explicitly; otherwise a packet consisting of
  // just "\r" would be passed through instead of withheld.
  let lastCR = buffer.length > 1 ?
    buffer.lastIndexOf('\r', buffer.length - 2) : -1;
  let lastLF = buffer.lastIndexOf('\n');
  let end = Math.max(lastCR, lastLF);
  return [buffer.substring(0, end + 1), buffer.substring(end + 1)];
}
/**
 * Tell the parser that all of the data has been delivered.
 *
 * Any input still being withheld is pushed through the internal parser, the
 * end of input is dispatched, and the emitter's endMessage callback is
 * invoked. If startMessage has not been emitted yet, it is emitted first.
 */
MimeParser.prototype.deliverEOF = function () {
  // Start of input buffered too long? Call start message now.
  if (!this._triggeredCall) {
    this._triggeredCall = true;
    this._callEmitter("startMessage");
  }

  // Force a flush of all of the data that was being withheld.
  let leftover = this._holdData;
  if (leftover) {
    this._dispatchData("", leftover, true);
  }
  this._dispatchEOF("");

  // Signal to the emitter that we're done.
  this._callEmitter("endMessage");
};
/**
 * Calls a method on the emitter safely.
 *
 * Nothing happens when there is no emitter, when the emitter lacks the named
 * callback, or when the first extra argument is a part number that
 * _willIgnorePart rejects. An error thrown by the callback is handed to the
 * onerror option rather than propagated, so it only escapes the parser if
 * onerror rethrows it.
 *
 * @param funcname {String} The function name to call on the emitter.
 * @param args... Extra arguments to pass into the emitter callback.
 */
MimeParser.prototype._callEmitter = function (funcname, ...args) {
  let emitter = this._emitter;
  if (!emitter || !(funcname in emitter))
    return;

  // partNum is always the first argument, so check to make sure that it
  // satisfies our emitter's pruneat requirement.
  if (args.length > 0 && this._willIgnorePart(args[0]))
    return;

  try {
    emitter[funcname].apply(emitter, args);
  } catch (e) {
    // We ensure that the onerror attribute in options is a function, so this
    // is always safe.
    this._options.onerror(e);
  }
};
/**
 * Helper function to decide if a part's output will never be seen.
 *
 * With an empty pruneat option nothing is ignored. Otherwise a part is kept
 * only when it is the pruneat part itself or lies below it, i.e. its number
 * starts with the pruneat value and continues with a `.' or `$' separator.
 *
 * @param part {String} The number of the part.
 * @returns {Boolean} True if the emitter is not interested in this part.
 */
MimeParser.prototype._willIgnorePart = function (part) {
  let prune = this._options["pruneat"];
  if (!prune)
    return false;

  // The part number has to begin with the pruneat value...
  if (part.slice(0, prune.length) !== prune)
    return true;

  // ...and an exact match is always wanted.
  if (part.length <= prune.length)
    return false;

  // Anything longer must continue with a new part indicator
  // (i.e., don't let 10 match with 1, but let 1.1 or 1$ do so).
  let separator = part[prune.length];
  return separator !== "$" && separator !== ".";
};
//////////////////////
// MIME parser core //
//////////////////////
// This MIME parser is a stateful parser; handling of the MIME tree is mostly
// done by creating new parsers and feeding data to them manually. In parallel
// to the externally-visible deliverData and deliverEOF, the two methods
// _dispatchData and _dispatchEOF are the internal counterparts that do the
// main work of moving data to where it needs to go; helper functions are used
// to handle translation.
//
// The overall flow of the parser is this. First, it buffers all of the data
// until the dual-CRLF pattern is noticed. Once that is found, it parses the
// entire header chunk at once. As a result of header parsing, the parser enters
// one of three modes for handling data, and uses a special regex to change
// modes and handle state changes. Specific details about the states the parser
// can be in are as follows:
// PARSING_HEADERS: The input buffer is concatenated to the currently-received
// text, which is then searched for the CRLFCRLF pattern. If found, the data
// is split at this boundary; the first chunk is parsed using _parseHeaders,
// and the second chunk will fall through to buffer processing. After
// splitting, the headers are delivered via the emitter, and _startBody is
// called to set up state for the parser.
// SEND_TO_BLACK_HOLE: All data in the input is ignored.
// SEND_TO_EMITTER: All data is passed into the emitter, if it is desired.
// Data can be optionally converted with this._convertData.
// SEND_TO_SUBPARSER: All data is passed into the subparser's _dispatchData
// method, using _subPartNum as the part number and _subparser as the object
// to call. Data can be optionally converted first with this._convertData.
//
// Additional state modifications can be done using a regex in _splitRegex and
// the callback method this._handleSplit(partNum, regexResult). The _handleSplit
// callback is free to do any modification to the current parser, including
// modifying the _splitRegex value. Packet conditioning guarantees that every
// buffer string passed into _dispatchData will have started immediately after a
// newline character in the fully assembled message.
//
// The this._convertData method, if present, is expected to return an array of
// two values, [{typedarray, string} decoded_buffer, string unused_buffer], and
// has as its arguments (string buffer, bool moreToCome).
//
// The header parsing by itself does very little parsing, only parsing as if all
// headers were unstructured fields. Values are munged so that embedded newlines
// are stripped and the result is also trimmed. Headers themselves are
// canonicalized into lower-case.
// Parser states, stored in a parser's _state field. See the large comment
// above for the full description of each mode.
// Input is still being accumulated as header text.
const PARSING_HEADERS = 1;
// Body data is discarded.
const SEND_TO_BLACK_HOLE = 2;
// Body data is handed to the emitter via deliverPartData.
const SEND_TO_EMITTER = 3;
// Body data is forwarded to this._subparser under this._subPartNum.
const SEND_TO_SUBPARSER = 4;
/**
 * Entry point for every chunk of packet data handed to this parser.
 *
 * Callers must condition the data so that each chunk begins right after a
 * newline of the assembled message, and must supply the part number of the
 * parser receiving it. While headers are still being read, the chunk is
 * appended to the pending header text; once the line break ending the header
 * block is seen, the part is started and whatever follows is handled as body
 * data. Body data is optionally matched against _splitRegex (reporting each
 * match to _handleSplit) and is then routed according to the parser state.
 *
 * @param partNum    {String}       The part number being currently parsed.
 * @param buffer     {BinaryString} The conditioned text to feed to the parser.
 * @param checkSplit {Boolean}      If true, test the text against _splitRegex.
 *                                  Only the splitting machinery itself passes
 *                                  false; every other caller passes true.
 */
MimeParser.prototype._dispatchData = function (partNum, buffer, checkSplit) {
  if (this._state == PARSING_HEADERS) {
    this._headerData += buffer;
    // The header block stops at a leading line break (no headers at all) or at
    // the first doubled line break.
    const headerEnd =
      /(?:^(?:\r\n|[\r\n]))|(\r\n|[\r\n])\1/.exec(this._headerData);
    if (headerEnd == null) {
      // The terminator has not arrived yet; keep accumulating.
      return;
    }
    // Everything past the terminator is body text for the code below.
    const bodyStart = headerEnd.index + headerEnd[0].length;
    buffer = this._headerData.substring(bodyStart);
    this._headerData = this._headerData.substr(0, headerEnd.index);
    this._headers = this._parseHeaders();
    this._callEmitter("startPart", partNum, this._headers);
    this._startBody(partNum);
  }

  // Body handling. Look for a split point first, since a match changes how
  // the text on either side of it is processed.
  const match = (checkSplit && this._splitRegex)
    ? this._splitRegex.exec(buffer)
    : null;
  if (match) {
    const before = buffer.substr(0, match.index);
    const after = buffer.substring(match.index + match[0].length);
    // Text ahead of the split still belongs to the current state.
    if (match.index > 0)
      this._dispatchData(partNum, before, false);
    // The handler is allowed to change anything on `this'.
    this._handleSplit(partNum, match);
    // The remainder may contain further splits, so check it again.
    if (after.length > 0)
      this._dispatchData(partNum, after, true);
    return;
  }

  // No split: route the text according to the current state.
  const state = this._state;
  if (state == SEND_TO_SUBPARSER) {
    const forwarded = this._applyDataConversion(buffer, "binarystring");
    if (forwarded.length > 0)
      this._subparser._dispatchData(this._subPartNum, forwarded, true);
    return;
  }
  if (state != SEND_TO_EMITTER) {
    // The black hole swallows its data.
    return;
  }
  // A body format of "none" suppresses body data, as does an ignored part.
  if (this._options["bodyformat"] == "none" || this._willIgnorePart(partNum))
    return;
  const payload = this._applyDataConversion(buffer, this._options["strformat"]);
  if (payload.length > 0)
    this._callEmitter("deliverPartData", partNum, payload);
};
/**
 * Prepare a chunk of body data for output in the requested format.
 *
 * When a converter (this._convertData) is installed, the leftover text from
 * the previous call is prepended, the combined text is converted, and
 * whatever the converter could not consume yet is stored for the next call.
 * The result is then coerced into the requested type.
 *
 * @param buf  {BinaryString} The data to be sent to the output.
 * @param type {String}       The type of the data to output. Valid values are
 *                            the same as the strformat option.
 * @returns Coerced and converted data that can be sent to the emitter or
 *          subparser.
 */
MimeParser.prototype._applyDataConversion = function (buf, type) {
  let output = buf;
  if (this._convertData) {
    // Leftovers from the last conversion go in front of the new data.
    const pending = this._savedBuffer + buf;
    const [converted, leftover] = this._convertData(pending, true);
    output = converted;
    this._savedBuffer = leftover;
  }
  // More data may still follow, hence the `true'.
  return this._coerceData(output, type, true);
};
/**
 * Coerce the input buffer into the given output type.
 *
 * Supported conversions:
 *  - string -> "binarystring": returned unchanged.
 *  - string -> "typedarray":   converted with mimeutils.stringToTypedArray.
 *  - string -> "unicode":      converted to a typed array, then decoded.
 *  - array  -> "binarystring": converted with mimeutils.typedArrayToString.
 *  - array  -> "typedarray":   returned unchanged.
 *  - array  -> "unicode":      decoded with this._decoder; if no decoder is
 *                              set, the typed array is returned as-is.
 *
 * @param buffer {BinaryString|Uint8Array} The data to be converted.
 * @param type   {String}  The type to convert the data to ("binarystring",
 *                         "unicode" or "typedarray").
 * @param more   {boolean} If true, more data will follow in later calls; the
 *                         value is handed to the decoder as its `stream'
 *                         option. If false, this is the final chunk.
 * @returns {BinaryString|String|Uint8Array} The desired output format.
 * @throws {Error} If buffer is not a string and type is not one of the
 *                 supported values.
 */
MimeParser.prototype._coerceData = function (buffer, type, more) {
  if (typeof buffer === "string") {
    // string -> binarystring is a nop
    if (type === "binarystring")
      return buffer;
    // Both remaining targets (array and unicode) need the array form.
    const typedarray = mimeutils.stringToTypedArray(buffer);
    // For unicode, decode the array; otherwise hand back the array itself.
    return type === "unicode" ? this._coerceData(typedarray, "unicode", more)
                              : typedarray;
  }
  if (type === "typedarray") {
    // array -> array is a nop. Without this case the call would fall through
    // to the "Invalid type" error even though the type is a valid one.
    return buffer;
  }
  if (type === "binarystring") {
    // Doing array -> binarystring
    return mimeutils.typedArrayToString(buffer);
  }
  if (type === "unicode") {
    // Doing array -> unicode: use the decoder set up earlier to convert.
    if (this._decoder)
      return this._decoder.decode(buffer, {stream: more});
    // If there is no charset, just return the typed array instead.
    return buffer;
  }
  throw new Error("Invalid type: " + type);
};
/**
 * Tell this parser that its input has ended.
 *
 * Any text still held back by the converter is flushed to wherever the
 * current state sends data, a nested parser (if one is active) is told about
 * the end of input as well, and finally endPart is emitted for this part.
 *
 * @param partNum {String} The part number being currently parsed.
 */
MimeParser.prototype._dispatchEOF = function (partNum) {
  if (this._state == PARSING_HEADERS) {
    // Input ended inside the header block. Parse what we have so that the
    // emitter still sees a startPart/endPart pair.
    this._headers = this._parseHeaders();
    this._callEmitter("startPart", partNum, this._headers);
    this._callEmitter("endPart", partNum);
    return;
  }

  if (this._state == SEND_TO_SUBPARSER) {
    // Hand over whatever the converter was still holding on to.
    if (this._convertData && this._savedBuffer) {
      this._subparser._dispatchData(this._subPartNum,
        this._convertData(this._savedBuffer, false)[0], true);
    }
    this._subparser._dispatchEOF(this._subPartNum);
    // The nested parser is finished; drop our reference to it.
    this._subparser = null;
  } else if (this._convertData && this._savedBuffer) {
    // Flush the held-back text to the emitter.
    const [flushed] = this._convertData(this._savedBuffer, false);
    const output = this._coerceData(flushed, this._options["strformat"], false);
    if (output.length > 0)
      this._callEmitter("deliverPartData", partNum, output);
  }

  // This part has reached its end; let the emitter know.
  this._callEmitter("endPart", partNum);
};
/**
 * Build the header object for the accumulated header text, treating every
 * header as an unstructured field, and work out the content type and charset
 * that apply to this part.
 *
 * @returns {StructuredHeaders} The structured header objects for the header
 *                              block.
 */
MimeParser.prototype._parseHeaders = function () {
  const headers = new StructuredHeaders(this._headerData, this._options);

  // Determine the value behind headers.contentType. When the part declares no
  // Content-Type, a default is synthesized and exposed through a getter;
  // otherwise the existing property is just made non-configurable.
  const declaredType = headers.get('Content-Type');
  const hasDeclaredType = typeof declaredType !== "undefined";
  const contentType = hasDeclaredType
    ? declaredType
    : headerparser.parseStructuredHeader('Content-Type',
        this._defaultContentType || 'text/plain');
  const descriptor = hasDeclaredType
    ? { configurable: false }
    : { get: function () { return contentType; } };
  Object.defineProperty(headers, "contentType", descriptor);

  // Pick the charset for this part. A forced charset always wins; otherwise
  // the charset parameter of the content type is used when there is one, and
  // the configured charset is the fallback.
  const useConfigured =
    this._options["force-charset"] || !contentType.has("charset");
  const charset = useConfigured ? this._options["charset"]
                                : contentType.get("charset");
  headers.charset = charset;
  // Keep a private copy so that later changes by users of the header object
  // cannot alter how the body of this part is decoded.
  this._charset = charset;
  return headers;
};
/**
 * Initialize the parser state for the body of this message.
 *
 * Based on the content type in this._headers, this decides where body data
 * will go (this._state), installs a split regex and subparser for multipart
 * and message/* parts, installs a Content-Transfer-Encoding decoder where one
 * applies (this._convertData), and sets up the charset decoder used for
 * unicode output of text parts (this._decoder).
 *
 * A charset label that TextDecoder does not recognize no longer aborts
 * parsing: such parts fall back to the same pass-through decoder that is used
 * when no charset is known at all.
 *
 * @param partNum {String} The part number being currently parsed.
 */
MimeParser.prototype._startBody = function Parser_startBody(partNum) {
  let contentType = this._headers.contentType;
  // Should the bodyformat be raw, we just want to pass through all data without
  // trying to interpret it.
  if (this._options["bodyformat"] == "raw" &&
      partNum == this._options["pruneat"]) {
    this._state = SEND_TO_EMITTER;
    return;
  }
  // The output depends on the content-type. Basic rule of thumb:
  // 1. Discrete media types (text, video, audio, image, application) are passed
  //    through with no alterations beyond Content-Transfer-Encoding unpacking.
  // 2. Everything with a media type of multipart is treated the same.
  // 3. Any message/* type that acts like a mail message (rfc822, news, global)
  //    is parsed as a header/body pair again. Most of the other message/* types
  //    have similar structures, but they don't have cascading child subparts,
  //    so it's better to pass their entire contents to the emitter and let the
  //    consumer deal with them.
  // 4. For untyped data, there needs to be no Content-Type header. This helps
  //    avoid false positives.
  if (contentType.mediatype == 'multipart') {
    // If there's no boundary type, everything will be part of the prologue of
    // the multipart message, so just feed everything into a black hole.
    if (!contentType.has('boundary')) {
      this._state = SEND_TO_BLACK_HOLE;
      return;
    }
    // The boundary of a multipart message needs to start with -- and be at the
    // beginning of the line. If -- is after the boundary, it represents the
    // terminator of the multipart. After the line, there may be only whitespace
    // and then the CRLF at the end. Since the CRLFs in here are necessary for
    // distinguishing the parts, they are not included in the subparts, so we
    // need to capture them in the regex as well to prevent them leaking out.
    this._splitRegex = new RegExp('(\r\n|[\r\n]|^)--' +
      contentType.get('boundary').replace(/[\\^$*+?.()|{}[\]]/g, '\\$&') +
      '(--)?[ \t]*(?:\r\n|[\r\n]|$)');
    this._handleSplit = this._whenMultipart;
    this._subparser = new MimeParser(this._emitter, this._options);
    // multipart/digest defaults to message/rfc822 instead of text/plain
    if (contentType.subtype == "digest")
      this._subparser._defaultContentType = "message/rfc822";
    // All text before the first boundary and after the closing boundary are
    // supposed to be ignored ("must be ignored", according to RFC 2046 §5.1.1);
    // in accordance with these wishes, ensure they don't get passed to any
    // deliverPartData.
    this._state = SEND_TO_BLACK_HOLE;
    // Multipart MIME messages stipulate that the final CRLF before the boundary
    // delimiter is not matched. When the packet ends on a CRLF, we don't know
    // if the next text could be the boundary. Therefore, we need to withhold
    // the last line of text to be sure of what's going on. The _convertData is
    // how we do this, even though we're not really converting any data.
    this._convertData = function mpart_no_leak_crlf(buffer, more) {
      let splitPoint = buffer.length;
      if (more) {
        // Hold back a trailing line break (LF, CR, or CRLF) until we know
        // whether a boundary follows it.
        if (buffer.charAt(splitPoint - 1) == '\n')
          splitPoint--;
        if (splitPoint > 0 && buffer.charAt(splitPoint - 1) == '\r')
          splitPoint--;
      }
      let res = conditionToEndOnCRLF(buffer.substring(0, splitPoint));
      let preLF = res[0];
      let rest = res[1];
      return [preLF, rest + buffer.substring(splitPoint)];
    };
  } else if (contentType.type == 'message/rfc822' ||
      contentType.type == 'message/global' ||
      contentType.type == 'message/news') {
    // The subpart is just another header/body pair that goes to EOF, so just
    // return the parse from that blob
    this._state = SEND_TO_SUBPARSER;
    this._subPartNum = partNum + "$";
    this._subparser = new MimeParser(this._emitter, this._options);
    // So, RFC 6532 happily allows message/global types to have CTE applied.
    // This means that subparts would need to be decoded to determine their
    // contents properly. There seems to be some evidence that message/rfc822
    // that is illegally-encoded exists in the wild, so be lenient and decode
    // for any message/* type that gets here.
    let cte = this._extractHeader('content-transfer-encoding', '');
    if (cte in ContentDecoders)
      this._convertData = ContentDecoders[cte];
  } else {
    // Okay, we just have to feed the data into the output
    this._state = SEND_TO_EMITTER;
    if (this._options["bodyformat"] == "decode") {
      // If we wish to decode, look it up in one of our decoders.
      let cte = this._extractHeader('content-transfer-encoding', '');
      if (cte in ContentDecoders)
        this._convertData = ContentDecoders[cte];
    }
  }
  // Set up the encoder for charset conversions; only do this for text parts.
  // Other parts are almost certainly binary, so no translation should be
  // applied to them.
  if (this._options["strformat"] == "unicode" &&
      contentType.mediatype == "text") {
    // Start from an identity decoder that passes the bytes through as a binary
    // string; without any decoder this._coerceData would hand back a typed
    // array instead of a string.
    this._decoder = {
      decode: function identity_decoder(buffer) {
        return MimeParser.prototype._coerceData(buffer, "binarystring", true);
      }
    };
    // If the charset is nonempty, try to use a real decoder for it.
    if (this._charset !== "") {
      try {
        this._decoder = new TextDecoder(this._charset);
      } catch (err) {
        // TextDecoder throws a RangeError for labels it does not know. Such
        // labels come straight from the message, so keep the identity decoder
        // rather than aborting the parse. Anything else is unexpected.
        let unknownLabel = err instanceof RangeError ||
          (err != null && err.name === "RangeError");
        if (!unknownLabel)
          throw err;
      }
    }
  } else {
    this._decoder = null;
  }
};
// Internal split handling for multipart messages.
/**
* When a multipart boundary is found, handle the process of managing the
* subparser state. This is meant to be used as a value for this._handleSplit.
*
* @param partNum {String} The part number being currently parsed.
* @param lastResult {Array} The result of the regular expression match.
*/
MimeParser.prototype._whenMultipart = function (partNum, lastResult) {
// Fix up the part number (don't do '' -> '.4' and don't do '1' -> '14')
if (partNum != "") partNum += ".";