feat: parse raw email to readable text body

This commit is contained in:
zimk
2026-03-04 16:29:58 +08:00
parent 201e880c36
commit f144afeb80

View File

@@ -35,6 +35,171 @@ function getHeader(headers, name) {
} }
} }
/**
 * Split a raw message (or MIME part) into its header section and its body.
 *
 * The split happens at the first blank line (LF or CRLF line endings, in any
 * combination). When no blank line exists the whole input is treated as body.
 *
 * @param {string} raw - Raw message or part text.
 * @returns {{ headerText: string, bodyText: string }} Text before and after the separator.
 */
function splitEmail(raw) {
  const found = raw.match(/\r?\n\r?\n/);
  if (found === null) {
    return { headerText: "", bodyText: raw };
  }

  const { index: headerEnd, 0: separator } = found;
  const bodyStart = headerEnd + separator.length;
  const headerText = raw.slice(0, headerEnd);
  const bodyText = raw.slice(bodyStart);
  return { headerText, bodyText };
}
/**
 * Parse an RFC 5322 style header section into a lookup object.
 *
 * - Header names are lower-cased.
 * - Folded continuation lines (starting with space or tab) are appended to the
 *   preceding header, separated by a single space.
 * - Repeated headers are merged into one value joined with ", ".
 * - Lines without a colon (or starting with one) are ignored.
 *
 * @param {string} headerText - The header section, without the blank separator line.
 * @returns {Object<string, string>} Map of lower-cased header name to value.
 */
function parseHeaderText(headerText) {
  const headers = {};
  let current = "";
  for (const line of headerText.split(/\r?\n/)) {
    if (/^[ \t]/.test(line) && current) {
      headers[current] += ` ${line.trim()}`;
      continue;
    }
    const idx = line.indexOf(":");
    if (idx <= 0) continue;
    const key = line.slice(0, idx).trim().toLowerCase();
    const value = line.slice(idx + 1).trim();
    if (key === "__proto__") {
      // Header names come from untrusted mail; never let one touch the
      // object's prototype slot. Also stop folding into the previous header.
      current = "";
      continue;
    }
    current = key;
    // Only consult own properties: a header literally named "constructor"
    // must not be merged with the member inherited from Object.prototype.
    const previous = Object.hasOwn(headers, key) ? headers[key] : "";
    headers[key] = previous ? `${previous}, ${value}` : value;
  }
  return headers;
}
/**
 * Extract a parameter (e.g. `charset`, `boundary`) from a structured header
 * value such as `text/plain; charset="utf-8"`.
 *
 * A quoted form (`name="value"`) is preferred over an unquoted one
 * (`name=value`). Matching is case-insensitive on the parameter name.
 *
 * @param {string} headerValue - Full header value; falsy values yield "".
 * @param {string} paramName - Parameter name to look up.
 * @returns {string} The parameter value without quotes, or "" when absent.
 */
function getHeaderParam(headerValue, paramName) {
  if (!headerValue) return "";
  const escaped = paramName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // The name must start the value or follow ";" / whitespace; otherwise
  // "name" would match inside "filename=" and "charset" inside "x-charset=".
  const prefix = "(?:^|[;\\s])";
  const quoted = new RegExp(`${prefix}${escaped}="([^"]+)"`, "i");
  const unquoted = new RegExp(`${prefix}${escaped}=([^;\\s]+)`, "i");
  const quotedMatch = headerValue.match(quoted);
  if (quotedMatch) return quotedMatch[1];
  const unquotedMatch = headerValue.match(unquoted);
  return unquotedMatch ? unquotedMatch[1] : "";
}
/**
 * Decode raw bytes into a string using the given character set.
 *
 * If decoding with `charset` throws (for example because the label is not a
 * supported encoding), the bytes are decoded as UTF-8 instead.
 *
 * @param {Uint8Array} bytes - Bytes to decode.
 * @param {string} [charset="utf-8"] - Encoding label passed to TextDecoder.
 * @returns {string} Decoded text.
 */
function bytesToText(bytes, charset = "utf-8") {
  const decodeAs = (label) => {
    const decoder = new TextDecoder(label);
    return decoder.decode(bytes);
  };

  try {
    return decodeAs(charset);
  } catch {
    // Best effort: unknown labels fall back to UTF-8.
    return decodeAs("utf-8");
  }
}
/**
 * Decode a base64 MIME body into bytes, tolerating malformed input.
 *
 * Following RFC 2045 section 6.8, every character outside the base64 alphabet
 * (line breaks, whitespace, stray punctuation) is ignored and the first "="
 * marks the end of the data. A dangling final character that cannot form a
 * byte is dropped. As a result this function does not throw on bad input,
 * which matters because the input is untrusted mail content.
 *
 * @param {string} input - Base64 text, possibly wrapped across lines.
 * @returns {Uint8Array} Decoded bytes (empty when nothing decodable remains).
 */
function base64ToBytes(input) {
  let clean = input.replace(/[^A-Za-z0-9+\/=]/g, "");
  const padAt = clean.indexOf("=");
  if (padAt !== -1) clean = clean.slice(0, padAt);
  // A length of 4k+1 leaves a lone sextet that atob() would reject.
  if (clean.length % 4 === 1) clean = clean.slice(0, -1);
  if (!clean) return new Uint8Array();
  const bin = atob(clean);
  return Uint8Array.from(bin, (ch) => ch.charCodeAt(0));
}
/**
 * Decode a quoted-printable MIME body into bytes.
 *
 * Soft line breaks ("=" at end of line) are removed, including ones followed
 * by trailing spaces or tabs, which RFC 2045 section 6.7 says must be ignored.
 * "=XX" hex escapes become the corresponding byte. Anything else is copied
 * through using the low 8 bits of each code unit; malformed escapes such as
 * "=ZZ" are kept literally.
 *
 * @param {string} input - Quoted-printable encoded text.
 * @returns {Uint8Array} Decoded bytes.
 */
function quotedPrintableToBytes(input) {
  const normalized = input
    .replace(/=[ \t]*\r?\n/g, "")
    .replace(/=([0-9A-Fa-f]{2})/g, (_m, hex) => String.fromCharCode(Number.parseInt(hex, 16)));
  const out = new Uint8Array(normalized.length);
  for (let i = 0; i < normalized.length; i++) {
    out[i] = normalized.charCodeAt(i) & 0xff;
  }
  return out;
}
/**
 * Decode the body of a single MIME part into text.
 *
 * - `base64` and `quoted-printable` bodies are decoded to bytes and then
 *   interpreted using `charset`.
 * - Any other transfer encoding (7bit, 8bit, binary, missing) is treated as an
 *   identity encoding: each code unit is taken as one byte and the bytes are
 *   interpreted using `charset`.
 *
 * @param {string} bodyText - Part body as found in the raw message.
 * @param {string} transferEncoding - Content-Transfer-Encoding header value (may be empty).
 * @param {string} charset - Character set label for the decoded bytes.
 * @returns {string} Decoded text.
 */
function decodePartBody(bodyText, transferEncoding, charset) {
  const enc = (transferEncoding || "").toLowerCase();
  if (enc.includes("base64")) {
    return bytesToText(base64ToBytes(bodyText), charset);
  }
  if (enc.includes("quoted-printable")) {
    return bytesToText(quotedPrintableToBytes(bodyText), charset);
  }
  const bytes = new Uint8Array(bodyText.length);
  for (let i = 0; i < bodyText.length; i++) {
    const unit = bodyText.charCodeAt(i);
    // A code unit above 0xFF cannot be a raw byte, so the input has already
    // been decoded to Unicode. Masking it to 8 bits would corrupt the text
    // (e.g. "中" -> "-"), so hand it back unchanged.
    // NOTE(review): units in 0x80-0xFF remain ambiguous; this depends on how
    // the caller turned the raw stream into a string - confirm.
    if (unit > 0xff) return bodyText;
    bytes[i] = unit;
  }
  return bytesToText(bytes, charset);
}
/**
 * Reduce an HTML fragment to readable plain text.
 *
 * Steps: drop <style>/<script> blocks, turn <br> and </p> into newlines,
 * strip all remaining tags, decode character references, remove carriage
 * returns, collapse runs of 3+ newlines to a blank line, and trim.
 *
 * Entities are decoded in a single pass so that escaped text such as
 * "&amp;lt;" yields "&lt;" rather than being decoded twice into "<".
 * Supported: &nbsp; &amp; &lt; &gt; &quot; &apos; and numeric references
 * (decimal and hex). Unknown or invalid references are left untouched.
 *
 * @param {string} html - HTML markup.
 * @returns {string} Plain-text rendering.
 */
function htmlToText(html) {
  const named = { nbsp: " ", amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  const decodeEntity = (match, body) => {
    if (body[0] !== "#") return named[body.toLowerCase()];
    const isHex = body[1] === "x" || body[1] === "X";
    const code = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // Keep &#160; consistent with &nbsp;, which maps to a regular space.
    if (code === 0xa0) return " ";
    const isSurrogate = code >= 0xd800 && code <= 0xdfff;
    const isValid = code > 0 && code <= 0x10ffff && !isSurrogate;
    return isValid ? String.fromCodePoint(code) : match;
  };
  return html
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<br\s*\/?\s*>/gi, "\n")
    .replace(/<\/p>/gi, "\n")
    .replace(/<[^>]+>/g, "")
    .replace(/&(nbsp|amp|lt|gt|quot|apos|#\d{1,7}|#x[0-9a-f]{1,6});/gi, decodeEntity)
    .replace(/\r/g, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
/**
 * Split a multipart body into its individual parts.
 *
 * Text before the first delimiter (preamble) and after the closing delimiter
 * (epilogue) is discarded. If the closing delimiter is missing, whatever was
 * collected for the last part is still returned. Lines inside a part are
 * joined with "\n" and the part is trimmed.
 *
 * Two RFC 2046 section 5.1.1 details are honoured:
 * - delimiter lines may carry trailing whitespace (transport padding);
 * - a part may have no headers, in which case it starts with a blank line.
 *   Such a part is returned with a leading "\n\n" so that a later
 *   header/body split sees an empty header section instead of mistaking the
 *   first paragraph of the body for headers.
 *
 * @param {string} bodyText - Body of the multipart entity.
 * @param {string} boundary - Boundary parameter value (without leading "--").
 * @returns {string[]} Raw text of each part (headers + body).
 */
function splitMultipartParts(bodyText, boundary) {
  const delimiter = `--${boundary}`;
  const closing = `--${boundary}--`;
  const parts = [];
  let collecting = false;
  let current = [];

  const flush = () => {
    if (!current.length) return;
    const headerless = current[0].trim() === "";
    const text = current.join("\n").trim();
    parts.push(headerless && text ? `\n\n${text}` : text);
    current = [];
  };

  for (const line of bodyText.split(/\r?\n/)) {
    const marker = line.trimEnd();
    if (marker === delimiter || marker === closing) {
      if (collecting) flush();
      collecting = marker !== closing;
      continue;
    }
    if (collecting) current.push(line);
  }
  flush();
  return parts;
}
/**
 * Extract a human-readable text body from a raw email message.
 *
 * The message is walked recursively as a MIME tree:
 * - multipart entities are split on their boundary and each part is visited;
 *   text/plain parts are preferred, otherwise text/html parts are converted
 *   to text;
 * - text leaves are decoded according to their transfer encoding and charset;
 * - non-text leaves and parts marked `Content-Disposition: attachment` are
 *   skipped so attachment data never ends up in the body.
 *
 * When nothing readable is found the original `raw` input is returned.
 *
 * @param {string} raw - Complete raw message (headers + body).
 * @returns {string} Readable body text, or `raw` as a fallback.
 */
function extractReadableBody(raw) {
  // Every path of walk() returns the same { kind, text } shape so callers can
  // inspect `.text` uniformly.
  const walk = (rawText) => {
    const { headerText, bodyText } = splitEmail(rawText);
    const headers = parseHeaderText(headerText);
    const contentTypeHeader = headers["content-type"] || "";
    const contentType = (contentTypeHeader || "text/plain").toLowerCase();

    if (contentType.includes("multipart/")) {
      const boundary = getHeaderParam(contentTypeHeader, "boundary");
      // Without a boundary the parts cannot be separated; expose the body as is.
      if (!boundary) return { kind: "text/plain", text: bodyText.trim() };
      const plainTexts = [];
      const htmlTexts = [];
      for (const part of splitMultipartParts(bodyText, boundary)) {
        const parsed = walk(part);
        if (!parsed.text) continue;
        if (parsed.kind === "text/plain") plainTexts.push(parsed.text);
        else if (parsed.kind === "text/html") htmlTexts.push(parsed.text);
      }
      if (plainTexts.length) return { kind: "text/plain", text: plainTexts.join("\n\n").trim() };
      if (htmlTexts.length) return { kind: "text/plain", text: htmlToText(htmlTexts.join("\n\n")).trim() };
      // Nothing readable inside (e.g. attachments only): report empty so a
      // parent multipart does not absorb raw encoded data.
      return { kind: "text/plain", text: "" };
    }

    const disposition = (headers["content-disposition"] || "").toLowerCase();
    if (!contentType.startsWith("text/") || disposition.startsWith("attachment")) {
      return { kind: contentType, text: "" };
    }

    const transferEncoding = headers["content-transfer-encoding"] || "";
    const charset = getHeaderParam(contentTypeHeader, "charset") || "utf-8";
    let decoded;
    try {
      decoded = decodePartBody(bodyText, transferEncoding, charset);
    } catch {
      // Best effort on untrusted input: a part whose encoding is malformed is
      // shown undecoded rather than failing the whole message.
      decoded = bodyText;
    }
    decoded = decoded.trim();
    if (contentType.includes("text/html")) return { kind: "text/html", text: decoded };
    return { kind: "text/plain", text: decoded };
  };

  const result = walk(raw);
  if (!result || !result.text) return raw;
  return result.kind === "text/html" ? htmlToText(result.text) : result.text;
}
const ONE_DAY_MS = 24 * 60 * 60 * 1000; const ONE_DAY_MS = 24 * 60 * 60 * 1000;
export default { export default {
@@ -88,7 +253,8 @@ export default {
const subject = getHeader(message.headers, "subject"); const subject = getHeader(message.headers, "subject");
const id = getHeader(message.headers, "message-id") || crypto.randomUUID(); const id = getHeader(message.headers, "message-id") || crypto.randomUUID();
const nexthop = recipient.includes("@") ? recipient.split("@")[1] : ""; const nexthop = recipient.includes("@") ? recipient.split("@")[1] : "";
const content = await streamToString(message.raw); const raw = await streamToString(message.raw);
const content = extractReadableBody(raw) || raw;
const received_at = Date.now(); const received_at = Date.now();
await env.DB.prepare( await env.DB.prepare(