mimeParse.js 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. /**
  2. * MIME Source Parser for Attachment Extraction
  3. *
  4. * Parses raw email MIME source to extract attachment metadata and content.
  5. * Used as a fallback when AppleScript's `mail attachments` returns empty
  6. * (which happens across all account types: iCloud, Google, Exchange).
  7. *
  8. * @module utils/mimeParse
  9. */
  /**
   * Extract the boundary string from a Content-Type header value
   * (or from any string containing a boundary= parameter).
   *
   * A quoted boundary is read up to its closing quote, so characters that are
   * only legal inside quotes (spaces, semicolons) are kept. An unquoted
   * boundary, or one whose closing quote is missing, ends at the first
   * whitespace, semicolon or double quote.
   *
   * @param {string} source - Header value or raw MIME text to search.
   * @returns {string|null} The first boundary found, or null when there is none.
   */
  function extractBoundary(source) {
    // Alternative 1: properly quoted value. Alternative 2: bare token, also used
    // as a lenient fallback when an opening quote is never closed.
    const match = source.match(/boundary=(?:"([^"\r\n]+)"|"?([^";\s]+))/i);
    if (!match)
      return null;
    // Exactly one of the two capture groups participates in a match.
    return match[1] || match[2];
  }
  /**
   * Extract a header value from a MIME part header block.
   * Handles folded headers (continuation lines starting with whitespace).
   *
   * The header name is matched case-insensitively at the start of a line and is
   * treated as literal text. Only spaces and tabs are skipped after the colon,
   * so a header with an empty value never picks up the following header line.
   *
   * @param {string} headers - Header block of a MIME part (or of the message).
   * @param {string} name - Header field name, without the trailing colon.
   * @returns {string|null} The unfolded, trimmed value, or null when the header
   *   is absent or its value is empty.
   */
  function getHeader(headers, name) {
    // Escape regex metacharacters so the name cannot alter the pattern.
    const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // First line may be empty (value starts on a continuation line); each
    // continuation line is a line break followed by at least one space or tab.
    const regex = new RegExp(`^${escapedName}:[ \\t]*(.*(?:\\r?\\n[ \\t]+.+)*)`, "im");
    const match = headers.match(regex);
    if (!match)
      return null;
    // Unfold: replace newline+whitespace with single space
    const value = match[1].replace(/\r?\n[ \t]+/g, " ").trim();
    return value === "" ? null : value;
  }
  /**
   * Determine the filename advertised by a MIME part.
   *
   * Looks at the `filename` parameter of Content-Disposition first and, when
   * that yields nothing, at the `name` parameter of Content-Type.
   *
   * @param {string} headers - Header block of the MIME part.
   * @returns {string|null} The trimmed filename, or null when neither header
   *   carries one.
   */
  function extractFilename(headers) {
    // Ordered by priority: the first header that yields a match wins.
    const lookups = [
      ["Content-Disposition", /filename="?([^";\r\n]+)"?/i],
      ["Content-Type", /name="?([^";\r\n]+)"?/i],
    ];
    for (const [headerName, pattern] of lookups) {
      const value = getHeader(headers, headerName);
      const found = value ? value.match(pattern) : null;
      if (found) {
        return found[1].trim();
      }
    }
    return null;
  }
  /**
   * Tell whether a MIME part declares an inline Content-Disposition
   * (as opposed to a regular attachment).
   *
   * @param {string} headers - Header block of the MIME part.
   * @returns {boolean} True when the Content-Disposition value starts with
   *   "inline" (case-insensitive); false otherwise, including when the header
   *   is missing.
   */
  function isInlineDisposition(headers) {
    const disposition = getHeader(headers, "Content-Disposition");
    return disposition ? disposition.toLowerCase().startsWith("inline") : false;
  }
  /**
   * Read the `size` parameter of a part's Content-Disposition header.
   *
   * @param {string} headers - Header block of the MIME part.
   * @returns {number} The declared size parsed as a base-10 integer, or 0 when
   *   the header or the parameter is missing.
   */
  function extractSize(headers) {
    const disposition = getHeader(headers, "Content-Disposition");
    const declared = disposition ? disposition.match(/size=(\d+)/i) : null;
    return declared ? parseInt(declared[1], 10) : 0;
  }
  /**
   * Read the media type (e.g. "image/png") from a part's Content-Type header.
   *
   * @param {string} headers - Header block of the MIME part.
   * @returns {string} The lower-cased media type without parameters, or
   *   "application/octet-stream" when the header is missing or has no leading
   *   type token.
   */
  function extractMimeType(headers) {
    const fallbackType = "application/octet-stream";
    const contentType = getHeader(headers, "Content-Type");
    if (contentType) {
      // The media type is everything up to the first semicolon or whitespace.
      const leading = contentType.match(/^([^;\s]+)/);
      if (leading) {
        return leading[1].toLowerCase();
      }
    }
    return fallbackType;
  }
  /**
   * Estimate decoded size from base64 content length.
   *
   * Whitespace (including line breaks) is ignored. Trailing `=` padding does
   * not represent payload, so up to two padding characters are subtracted from
   * the 3-bytes-per-4-characters estimate.
   *
   * @param {string} base64Body - Base64 text, possibly wrapped across lines.
   * @returns {number} Number of bytes the text decodes to (never negative).
   */
  function estimateBase64Size(base64Body) {
    const cleaned = base64Body.replace(/\s/g, "");
    // Each trailing "=" stands for one byte that is absent from the final group.
    const paddingMatch = cleaned.match(/={1,2}$/);
    const padding = paddingMatch ? paddingMatch[0].length : 0;
    return Math.max(0, Math.floor((cleaned.length * 3) / 4) - padding);
  }
  /**
   * Split a MIME block into parts using the given boundary.
   * Does not recurse — call walkLeafParts for recursive traversal.
   *
   * A delimiter is only recognised as a whole line: `--boundary` at the start
   * of the text or of a line, followed either by `--` (closing delimiter) or by
   * optional spaces/tabs and a line break or the end of the text. The boundary
   * string appearing in the middle of a line, or as the prefix of a longer
   * boundary belonging to a nested multipart, therefore does not split the block.
   *
   * Sections that are empty, that start with `--` (the remainder after a closing
   * delimiter), or that contain no blank line separating headers from body are
   * skipped. Each section is trimmed before it is split.
   *
   * @param {string} source - Text containing the delimited parts.
   * @param {string} boundary - Boundary string without the leading dashes.
   * @returns {Array<{headers: string, body: string}>} Parts in document order.
   */
  function splitMimeParts(source, boundary) {
    const parts = [];
    // The boundary is literal text; escape anything the regex engine would interpret.
    const escapedBoundary = boundary.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The lookahead leaves the closing "--" in the following section so that
    // the section is recognised and skipped below.
    const delimiter = new RegExp(
      `(?:^|\\r?\\n)--${escapedBoundary}(?=--|[ \\t]*(?:\\r?\\n|$))`,
      "g"
    );
    for (const section of source.split(delimiter)) {
      const trimmed = section.trim();
      if (!trimmed || trimmed.startsWith("--"))
        continue;
      // Split headers from body at first blank line
      const blankLineIdx = trimmed.search(/\r?\n\r?\n/);
      if (blankLineIdx === -1)
        continue;
      const headers = trimmed.substring(0, blankLineIdx);
      const body = trimmed.substring(blankLineIdx).replace(/^\r?\n\r?\n/, "");
      parts.push({ headers, body });
    }
    return parts;
  }
  /**
   * Collect every non-container part of a multipart MIME block.
   *
   * The block is split on `boundary`; any part whose Content-Type starts with
   * `multipart/` and declares its own boundary is expanded recursively, and its
   * leaves take its place in the result. A multipart part without a boundary is
   * returned as a leaf.
   *
   * @param {string} source - Text of the multipart block.
   * @param {string} boundary - Boundary delimiting the parts of `source`.
   * @returns {Array<{headers: string, body: string}>} Leaf parts in document order.
   */
  function walkLeafParts(source, boundary) {
    const leaves = [];
    splitMimeParts(source, boundary).forEach((part) => {
      const contentType = getHeader(part.headers, "Content-Type");
      const isContainer = Boolean(contentType) && /^multipart\//i.test(contentType);
      const innerBoundary = isContainer ? extractBoundary(contentType) : null;
      if (innerBoundary) {
        // Replace the container by the leaves found inside it.
        for (const leaf of walkLeafParts(part.body, innerBoundary)) {
          leaves.push(leaf);
        }
      } else {
        leaves.push(part);
      }
    });
    return leaves;
  }
  /**
   * Turn the textual body of a MIME part into bytes according to its
   * Content-Transfer-Encoding.
   *
   * @param {string} body - Body text of the part.
   * @param {string|null|undefined} encoding - Transfer encoding value; compared
   *   case-insensitively after trimming.
   * @returns {Buffer} Decoded bytes: base64-decoded (whitespace removed first),
   *   quoted-printable-decoded, or — for any other or missing encoding — the
   *   body converted with Node's "binary" string encoding.
   */
  function decodeBody(body, encoding) {
    switch ((encoding || "").toLowerCase().trim()) {
      case "base64":
        return Buffer.from(body.replace(/[\s\r\n]/g, ""), "base64");
      case "quoted-printable":
        return decodeQuotedPrintable(body);
      default:
        // 7bit, 8bit, binary, or unspecified: the characters are the bytes.
        return Buffer.from(body, "binary");
    }
  }
  /**
   * Decode a quoted-printable body into bytes.
   *
   * Soft line breaks (`=` directly followed by CRLF or LF) are removed first.
   * Every `=XX` sequence with two hexadecimal digits then becomes the byte 0xXX;
   * any other character, including a `=` that does not start a valid escape,
   * contributes the low 8 bits of its character code.
   *
   * @param {string} body - Quoted-printable text.
   * @returns {Buffer} The decoded bytes.
   */
  function decodeQuotedPrintable(body) {
    const text = body.replace(/=\r?\n/g, "");
    const hexPair = /^[0-9A-Fa-f]{2}$/;
    const out = [];
    let pos = 0;
    while (pos < text.length) {
      const pair = text.substring(pos + 1, pos + 3);
      // An escape needs "=" plus two more characters that are both hex digits.
      const isEscape = text[pos] === "=" && pos + 2 < text.length && hexPair.test(pair);
      if (isEscape) {
        out.push(parseInt(pair, 16));
        pos += 3;
      } else {
        out.push(text.charCodeAt(pos) & 0xff);
        pos += 1;
      }
    }
    return Buffer.from(out);
  }
  /**
   * Approximate the size of a part's content from its encoded body, for use
   * when no explicit size is declared.
   *
   * @param {string} body - Body text of the part.
   * @param {string|null|undefined} encoding - Transfer encoding value; compared
   *   case-insensitively after trimming.
   * @returns {number} The base64 size estimate for base64 bodies; for every
   *   other encoding, the length of the body string.
   */
  function estimateSize(body, encoding) {
    const normalized = (encoding || "").toLowerCase().trim();
    return normalized === "base64" ? estimateBase64Size(body) : body.length;
  }
  /**
   * List the file attachments found in a raw MIME message.
   *
   * Walks all leaf parts (descending into nested multipart/* containers) and
   * keeps those that carry a filename and are not declared inline. The reported
   * size is the Content-Disposition `size` parameter when it is present and
   * non-zero, otherwise an estimate derived from the encoded body.
   *
   * @param source - Raw MIME source of the email
   * @returns Array of attachment metadata (name, mimeType, size); empty when the
   *   source is blank or declares no boundary
   */
  export function parseMimeAttachments(source) {
    const boundary = source && source.trim() ? extractBoundary(source) : null;
    if (!boundary) {
      return [];
    }
    return walkLeafParts(source, boundary)
      .map((part) => ({ part, name: extractFilename(part.headers) }))
      .filter(({ part, name }) => name && !isInlineDisposition(part.headers))
      .map(({ part, name }) => {
        const encoding = getHeader(part.headers, "Content-Transfer-Encoding");
        return {
          name,
          mimeType: extractMimeType(part.headers),
          size: extractSize(part.headers) || estimateSize(part.body, encoding),
        };
      });
  }
  /**
   * Find one attachment in a raw MIME message by filename and decode it.
   *
   * Walks all leaf parts (descending into nested multipart/* containers) and
   * uses the first one whose filename is strictly equal to `attachmentName`.
   * The body is decoded according to the part's Content-Transfer-Encoding
   * (base64, quoted-printable, or raw). The reported size is the
   * Content-Disposition `size` parameter when present and non-zero, otherwise
   * the decoded byte length.
   *
   * @param source - Raw MIME source of the email
   * @param attachmentName - Filename to extract
   * @returns Decoded attachment (name, mimeType, size, data), or null if the
   *   source is blank, declares no boundary, or has no matching part
   */
  export function extractMimeAttachment(source, attachmentName) {
    if (!source || !source.trim()) {
      return null;
    }
    const boundary = extractBoundary(source);
    if (!boundary) {
      return null;
    }
    const wanted = walkLeafParts(source, boundary).find(
      (part) => extractFilename(part.headers) === attachmentName
    );
    if (!wanted) {
      return null;
    }
    const transferEncoding = getHeader(wanted.headers, "Content-Transfer-Encoding");
    const data = decodeBody(wanted.body, transferEncoding);
    return {
      // Identical to the part's filename thanks to the strict match above.
      name: attachmentName,
      mimeType: extractMimeType(wanted.headers),
      size: extractSize(wanted.headers) || data.length,
      data,
    };
  }