feat: parse raw email to readable text body

2026-03-04 16:29:58 +08:00
parent 201e880c36
commit f144afeb80
1 changed files with 167 additions and 1 deletions
--- a/src/index.js
+++ b/src/index.js
@@ -35,6 +35,171 @@ function getHeader(headers, name) {
  }
 }
 /**
  * Split a raw message (or a single MIME part) into its header and body text.
  *
  * The split happens at the first blank line. Input that begins with a line
  * break has an empty header section, so everything after that line break is
  * body. Input with no separator at all is treated as body only.
  *
  * @param {string} raw - Raw message or part text, with CRLF or LF line endings.
  * @returns {{ headerText: string, bodyText: string }} Both sections, separator removed.
  */
 function splitEmail(raw) {
  // Alternation order matters: at offset 0 a full blank-line pair is consumed
  // whole before falling back to a single leading line break.
  const separator = /\r?\n\r?\n|^\r?\n/.exec(raw);
  if (!separator) return { headerText: "", bodyText: raw };
  const bodyStart = separator.index + separator[0].length;
  return {
    headerText: raw.slice(0, separator.index),
    bodyText: raw.slice(bodyStart)
  };
 }
 /**
  * Parse a block of header lines into a lookup keyed by lower-cased header name.
  *
  * Folded continuation lines (starting with a space or tab) are appended to the
  * previous header, separated by a single space. Repeated headers are joined
  * with ", ". Lines without a colon, or starting with one, are skipped.
  *
  * @param {string} headerText - Header section text, with CRLF or LF line endings.
  * @returns {Object<string, string>} Prototype-less map of header name to value.
  */
 function parseHeaderText(headerText) {
  // Header names come from untrusted mail. A prototype-less object keeps names
  // such as "constructor" or "__proto__" from colliding with inherited members.
  const headers = Object.create(null);
  let current = "";
  for (const line of headerText.split(/\r?\n/)) {
    if (current && /^[ \t]/.test(line)) {
      headers[current] += ` ${line.trim()}`;
      continue;
    }
    const colon = line.indexOf(":");
    if (colon <= 0) continue;
    const key = line.slice(0, colon).trim().toLowerCase();
    const value = line.slice(colon + 1).trim();
    current = key;
    headers[key] = headers[key] ? `${headers[key]}, ${value}` : value;
  }
  return headers;
 }
 /**
  * Extract a parameter value (e.g. `charset`, `boundary`) from a header value.
  *
  * The parameter name must start the value or follow a `;` or whitespace, so
  * `charset` does not match inside `x-charset=`. The first occurrence wins. A
  * quoted value is returned without its quotes; an unquoted value runs up to
  * the next `;` or whitespace. Matching is case-insensitive.
  *
  * @param {string} headerValue - Full header value, e.g. `text/plain; charset=utf-8`.
  * @param {string} paramName - Parameter name to look up.
  * @returns {string} The parameter value, or "" when absent or the header is empty.
  */
 function getHeaderParam(headerValue, paramName) {
  if (!headerValue) return "";
  // Escape regex metacharacters so the name is matched literally.
  const escaped = paramName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(`(?:^|[;\\s])${escaped}=(?:"([^"]*)"|([^;\\s]+))`, "i");
  const match = pattern.exec(headerValue);
  if (!match) return "";
  // Group 1 is the quoted form, group 2 the bare token.
  return match[1] !== undefined ? match[1] : match[2];
 }
 /**
  * Decode a byte sequence into a string using the given character set.
  *
  * When the decoder cannot be used with the requested label, UTF-8 is used
  * instead.
  *
  * @param {Uint8Array} bytes - Bytes to decode.
  * @param {string} [charset="utf-8"] - Encoding label passed to TextDecoder.
  * @returns {string} The decoded text.
  */
 function bytesToText(bytes, charset = "utf-8") {
  let decoder;
  try {
    decoder = new TextDecoder(charset);
  } catch {
    decoder = new TextDecoder("utf-8");
  }
  return decoder.decode(bytes);
 }
 /**
  * Decode base64 text into bytes, tolerating the noise found in real mail.
  *
  * Characters outside the base64 alphabet (line breaks, spaces, stray
  * punctuation) are ignored, as RFC 2045 asks of decoders. Data is taken up to
  * the first `=` padding character, and a dangling final character that cannot
  * encode a full byte is dropped. Malformed input therefore yields a
  * best-effort result instead of making `atob` throw.
  *
  * @param {string} input - Base64 text, possibly wrapped or padded.
  * @returns {Uint8Array} The decoded bytes (empty when there is nothing to decode).
  */
 function base64ToBytes(input) {
  let clean = input.replace(/[^A-Za-z0-9+/=]/g, "");
  // Padding marks the end of the encoded data.
  const padAt = clean.indexOf("=");
  if (padAt !== -1) clean = clean.slice(0, padAt);
  // A single leftover character carries only 6 bits and cannot form a byte.
  if (clean.length % 4 === 1) clean = clean.slice(0, -1);
  if (!clean) return new Uint8Array();
  // Re-pad to a multiple of four so the decoder sees canonical input.
  const padded = clean.padEnd(clean.length + ((4 - (clean.length % 4)) % 4), "=");
  const bin = atob(padded);
  return Uint8Array.from({ length: bin.length }, (_, i) => bin.charCodeAt(i));
 }
 /**
  * Decode quoted-printable text into bytes.
  *
  * Soft line breaks (`=` directly followed by a line break) are removed first,
  * then each `=XX` hex escape becomes the character with that code. Every
  * resulting UTF-16 code unit is reduced to its low 8 bits.
  *
  * @param {string} input - Quoted-printable encoded text.
  * @returns {Uint8Array} One byte per code unit of the unescaped text.
  */
 function quotedPrintableToBytes(input) {
  const withoutSoftBreaks = input.replace(/=\r?\n/g, "");
  const unescaped = withoutSoftBreaks.replace(/=([0-9A-Fa-f]{2})/g, (_match, hexPair) =>
    String.fromCharCode(parseInt(hexPair, 16))
  );
  // Index by code unit, not by code point, so surrogate pairs yield two bytes.
  return Uint8Array.from(
    { length: unescaped.length },
    (_unused, index) => unescaped.charCodeAt(index) & 0xff
  );
 }
 /**
  * Decode one MIME part body into text according to its transfer encoding.
  *
  * `base64` and `quoted-printable` bodies are turned into bytes and decoded
  * with `charset`. Any other encoding treats each character of `bodyText` as
  * one raw byte and decodes those bytes with `charset`. If such a body holds
  * a character above U+00FF it cannot be a byte string, so it is returned
  * unchanged rather than being corrupted by truncation to 8 bits.
  *
  * @param {string} bodyText - Part body as found in the message.
  * @param {string} transferEncoding - Content-Transfer-Encoding value (may be empty).
  * @param {string} charset - Character set label for the decoded bytes.
  * @returns {string} The decoded body text.
  */
 function decodePartBody(bodyText, transferEncoding, charset) {
  const enc = (transferEncoding || "").toLowerCase();
  if (enc.includes("base64")) {
    return bytesToText(base64ToBytes(bodyText), charset);
  }
  if (enc.includes("quoted-printable")) {
    return bytesToText(quotedPrintableToBytes(bodyText), charset);
  }
  const bytes = new Uint8Array(bodyText.length);
  for (let i = 0; i < bodyText.length; i++) {
    const code = bodyText.charCodeAt(i);
    // A code unit above 0xFF means the text was already decoded upstream.
    if (code > 0xff) return bodyText;
    bytes[i] = code;
  }
  return bytesToText(bytes, charset);
 }
 /**
  * Reduce an HTML fragment to plain text.
  *
  * Removes `<style>` and `<script>` elements, turns `<br>` and `</p>` into line
  * breaks, strips all remaining tags, decodes character references, drops
  * carriage returns, collapses runs of three or more newlines to two, and
  * trims the result.
  *
  * Character references are decoded in a single pass so text that was escaped
  * on purpose (e.g. `&amp;lt;`) is unescaped exactly once. Supported: `&nbsp;`,
  * `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, and decimal or hex numeric
  * references. A numeric reference beyond U+10FFFF is left untouched.
  *
  * @param {string} html - HTML markup.
  * @returns {string} Plain-text rendering of the markup.
  */
 function htmlToText(html) {
  const namedEntities = { nbsp: " ", amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  const decodeEntity = (match, body) => {
    if (body[0] !== "#") return namedEntities[body.toLowerCase()];
    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  };
  return html
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<br\s*\/?\s*>/gi, "\n")
    .replace(/<\/p>/gi, "\n")
    .replace(/<[^>]+>/g, "")
    // Tags are stripped before decoding so an escaped "&lt;b&gt;" survives as text.
    .replace(/&(nbsp|amp|lt|gt|quot|apos|#\d+|#x[0-9a-f]+);/gi, decodeEntity)
    .replace(/\r/g, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
 }
 /**
  * Split a multipart body into the raw text of each part.
  *
  * Text before the first `--boundary` line and after the closing
  * `--boundary--` line is discarded. Boundary lines are recognised even when
  * followed by trailing whitespace. If the closing line is missing, whatever
  * was collected for the last part is still returned.
  *
  * Part lines are re-joined with "\n" and only trailing whitespace is removed.
  * A leading blank line is kept because it marks a part that has no headers.
  *
  * @param {string} bodyText - Body of the multipart entity.
  * @param {string} boundary - Boundary value without the leading dashes.
  * @returns {string[]} Raw text of each part, in order.
  */
 function splitMultipartParts(bodyText, boundary) {
  const delimiter = `--${boundary}`;
  const closing = `${delimiter}--`;
  const parts = [];
  let collecting = false;
  let current = [];
  for (const line of bodyText.split(/\r?\n/)) {
    // Mail transports may pad boundary lines with trailing whitespace.
    const marker = line.trimEnd();
    if (marker === delimiter || marker === closing) {
      if (collecting && current.length) parts.push(current.join("\n").trimEnd());
      current = [];
      collecting = marker !== closing;
      continue;
    }
    if (collecting) current.push(line);
  }
  // No closing delimiter (truncated message): keep what was gathered.
  if (current.length) parts.push(current.join("\n").trimEnd());
  return parts;
 }
 /**
  * Extract a human-readable text body from a raw email.
  *
  * The message is parsed recursively. For a multipart entity, all text/plain
  * parts are preferred (joined by a blank line); otherwise all text/html parts
  * are joined and converted to text; otherwise the undecoded multipart body is
  * used. A multipart entity whose boundary parameter is missing cannot be
  * split, so its trimmed body is used as plain text. A single-part entity is
  * decoded according to its transfer encoding and charset.
  *
  * @param {string} raw - Complete raw message text.
  * @returns {string} The readable body, or `raw` itself when no text was found.
  */
 function extractReadableBody(raw) {
  // walk() always returns { kind, text } so callers can inspect either field.
  const walk = (rawText) => {
    const { headerText, bodyText } = splitEmail(rawText);
    const headers = parseHeaderText(headerText);
    const rawContentType = headers["content-type"] || "";
    const contentType = (rawContentType || "text/plain").toLowerCase();
    const transferEncoding = headers["content-transfer-encoding"] || "";
    const charset = getHeaderParam(rawContentType, "charset") || "utf-8";
    if (contentType.includes("multipart/")) {
      const boundary = getHeaderParam(rawContentType, "boundary");
      // Return the same shape as every other branch; a bare string here has no
      // `.text`, which made callers discard the body.
      if (!boundary) return { kind: "text/plain", text: bodyText.trim() };
      const parts = splitMultipartParts(bodyText, boundary);
      const plainTexts = [];
      const htmlTexts = [];
      for (const part of parts) {
        const parsed = walk(part);
        if (!parsed.text) continue;
        if (parsed.kind === "text/plain") plainTexts.push(parsed.text);
        else if (parsed.kind === "text/html") htmlTexts.push(parsed.text);
      }
      if (plainTexts.length) return { kind: "text/plain", text: plainTexts.join("\n\n").trim() };
      // HTML is converted here, so the result is reported as plain text.
      if (htmlTexts.length) return { kind: "text/plain", text: htmlToText(htmlTexts.join("\n\n")).trim() };
      return { kind: "text/plain", text: bodyText.trim() };
    }
    const decoded = decodePartBody(bodyText, transferEncoding, charset).trim();
    if (contentType.includes("text/html")) return { kind: "text/html", text: decoded };
    return { kind: "text/plain", text: decoded };
  };
  const result = walk(raw);
  if (!result || !result.text) return raw;
  return result.kind === "text/html" ? htmlToText(result.text) : result.text;
 }
 // Number of milliseconds in one day (24 h x 60 min x 60 s x 1000 ms).
 const ONE_DAY_MS = 24 * 60 * 60 * 1000;
 export default {
@@ -88,7 +253,8 @@ export default {
    const subject = getHeader(message.headers, "subject");
    const id = getHeader(message.headers, "message-id") || crypto.randomUUID();
    const nexthop = recipient.includes("@") ? recipient.split("@")[1] : "";
-    const content = await streamToString(message.raw);
+    const raw = await streamToString(message.raw);
    const content = extractReadableBody(raw) || raw;
    const received_at = Date.now();
    await env.DB.prepare(