feat: parse raw email to readable text body
This commit is contained in:
168
src/index.js
168
src/index.js
@@ -35,6 +35,171 @@ function getHeader(headers, name) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Split a raw message (or a single MIME part) into its header section and body.
 *
 * The header section ends at the first empty line. Input that begins with a
 * line break has an empty header section (a MIME part may legally carry no
 * headers at all), so everything after that first line break is the body.
 * Input without any empty line is treated as body-only.
 *
 * @param {string} raw - Raw message or MIME part text.
 * @returns {{ headerText: string, bodyText: string }} Header block (without the
 *   separating blank line) and the remaining body text.
 */
function splitEmail(raw) {
  // A leading line break IS the header/body separator of an empty header block;
  // without this guard the first body paragraph would be mistaken for headers.
  const leadingBreak = raw.match(/^\r?\n/);
  if (leadingBreak) {
    return { headerText: "", bodyText: raw.slice(leadingBreak[0].length) };
  }

  const separator = raw.match(/\r?\n\r?\n/);
  if (!separator) return { headerText: "", bodyText: raw };

  return {
    headerText: raw.slice(0, separator.index),
    bodyText: raw.slice(separator.index + separator[0].length),
  };
}
|
||||
|
||||
/**
 * Parse a raw header block into a map of lower-cased header names to values.
 *
 * - Folded continuation lines (starting with space or tab) are appended to the
 *   preceding header, joined by a single space.
 * - Repeated headers are merged into one value separated by ", ".
 * - Lines without a colon, or with an empty name, are ignored.
 *
 * @param {string} headerText - Header section of a message or MIME part.
 * @returns {Object<string, string>} Null-prototype object keyed by lower-cased
 *   header name.
 */
function parseHeaderText(headerText) {
  // Null prototype: header names come from untrusted mail, and names such as
  // "constructor" or "__proto__" must not collide with inherited properties.
  const headers = Object.create(null);
  let current = "";

  for (const line of headerText.split(/\r?\n/)) {
    if (current && /^[ \t]/.test(line)) {
      headers[current] += ` ${line.trim()}`;
      continue;
    }

    const colonAt = line.indexOf(":");
    if (colonAt <= 0) continue;

    const key = line.slice(0, colonAt).trim().toLowerCase();
    if (!key) continue;

    const value = line.slice(colonAt + 1).trim();
    current = key;
    headers[key] = headers[key] ? `${headers[key]}, ${value}` : value;
  }

  return headers;
}
|
||||
|
||||
/**
 * Extract a parameter (e.g. `charset`, `boundary`) from a structured header
 * value such as `text/plain; charset="utf-8"`.
 *
 * The parameter name is matched case-insensitively and only at the start of
 * the value or after `;`/whitespace, so looking up `name` does not match inside
 * `filename=`. Both quoted and unquoted values are supported.
 *
 * @param {string} headerValue - Full header value; may be empty or undefined.
 * @param {string} paramName - Parameter name to look up.
 * @returns {string} The parameter value without surrounding quotes, or "" when
 *   the parameter is absent or empty.
 */
function getHeaderParam(headerValue, paramName) {
  if (!headerValue) return "";

  // Escape regex metacharacters so the name is matched literally.
  const escaped = paramName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(
    `(?:^|[;\\s])${escaped}=(?:"([^"]*)"|([^;\\s"]+))`,
    "i"
  );

  const match = headerValue.match(pattern);
  if (!match) return "";

  // Group 1 is the quoted form, group 2 the bare token form.
  return match[1] !== undefined ? match[1] : match[2];
}
|
||||
|
||||
/**
 * Decode raw bytes into a string using the given charset label.
 *
 * Unknown or unsupported charset labels fall back to UTF-8 instead of failing,
 * because the label comes from untrusted mail headers.
 *
 * @param {Uint8Array} bytes - Bytes to decode.
 * @param {string} [charset="utf-8"] - Encoding label as accepted by TextDecoder.
 * @returns {string} Decoded text.
 */
function bytesToText(bytes, charset = "utf-8") {
  let decoder;
  try {
    // Only the constructor can reject a label; keep the try scoped to it.
    decoder = new TextDecoder(charset);
  } catch {
    decoder = new TextDecoder("utf-8");
  }
  return decoder.decode(bytes);
}
|
||||
|
||||
/**
 * Decode a base64 transfer-encoded body into bytes.
 *
 * Decoding is lenient, as required for MIME bodies: every character outside
 * the base64 alphabet (line breaks, spaces, stray punctuation) is ignored,
 * data after the first padding character is discarded, and a dangling single
 * character left by truncation is dropped rather than causing a failure.
 *
 * @param {string} input - Base64 text, possibly wrapped or slightly malformed.
 * @returns {Uint8Array} Decoded bytes (empty when there is nothing to decode).
 */
function base64ToBytes(input) {
  const alphabetOnly = input.replace(/[^A-Za-z0-9+/=]/g, "");

  // Padding marks the end of the encoded data; atob accepts unpadded input.
  const padAt = alphabetOnly.indexOf("=");
  let data = padAt === -1 ? alphabetOnly : alphabetOnly.slice(0, padAt);

  // A remainder of one character cannot encode a byte and would make atob throw.
  if (data.length % 4 === 1) data = data.slice(0, -1);
  if (!data) return new Uint8Array();

  const binary = atob(data);
  const out = new Uint8Array(binary.length);
  for (let i = 0; i < binary.length; i++) out[i] = binary.charCodeAt(i);
  return out;
}
|
||||
|
||||
/**
 * Decode a quoted-printable transfer-encoded body into bytes.
 *
 * Soft line breaks (`=` at end of line, optionally followed by transport
 * padding of spaces/tabs) are removed, and `=XX` hex escapes are turned into
 * the corresponding byte. `=` sequences that are not valid escapes are kept
 * verbatim.
 *
 * @param {string} input - Quoted-printable encoded text.
 * @returns {Uint8Array} Decoded bytes; each character is reduced to its low
 *   8 bits.
 */
function quotedPrintableToBytes(input) {
  const unescaped = input
    // Soft line break, tolerating trailing whitespace added in transit.
    .replace(/=[ \t]*\r?\n/g, "")
    .replace(/=([0-9A-Fa-f]{2})/g, (_match, hex) =>
      String.fromCharCode(Number.parseInt(hex, 16))
    );

  const out = new Uint8Array(unescaped.length);
  for (let i = 0; i < unescaped.length; i++) {
    out[i] = unescaped.charCodeAt(i) & 0xff;
  }
  return out;
}
|
||||
|
||||
/**
 * Decode the body of a single MIME part into text.
 *
 * `base64` and `quoted-printable` bodies are decoded to bytes and then
 * interpreted in `charset`. Any other transfer encoding (7bit, 8bit, binary,
 * or none) treats each character of `bodyText` as one byte.
 *
 * @param {string} bodyText - Part body as it appears in the raw message.
 * @param {string} transferEncoding - Content-Transfer-Encoding value; may be empty.
 * @param {string} charset - Charset label used to interpret the decoded bytes.
 * @returns {string} Decoded text.
 */
function decodePartBody(bodyText, transferEncoding, charset) {
  const enc = (transferEncoding || "").toLowerCase();

  if (enc.includes("base64")) {
    return bytesToText(base64ToBytes(bodyText), charset);
  }
  if (enc.includes("quoted-printable")) {
    return bytesToText(quotedPrintableToBytes(bodyText), charset);
  }

  const bytes = new Uint8Array(bodyText.length);
  for (let i = 0; i < bodyText.length; i++) {
    const code = bodyText.charCodeAt(i);
    // A code unit above 0xFF cannot be a raw byte: the input has already been
    // decoded to text upstream, and truncating it to 8 bits would corrupt it.
    // NOTE(review): how the raw stream is turned into a string is not visible
    // here — confirm whether it is a byte string or UTF-8-decoded text.
    if (code > 0xff) return bodyText;
    bytes[i] = code;
  }
  return bytesToText(bytes, charset);
}
|
||||
|
||||
/**
 * Convert an HTML fragment into readable plain text.
 *
 * Steps, in order: drop `<style>`/`<script>` blocks, turn `<br>` and `</p>`
 * into line breaks, strip all remaining tags, decode character references,
 * remove carriage returns, collapse runs of 3+ newlines to a blank line, and
 * trim.
 *
 * Character references are decoded in a single pass so that an escaped
 * reference such as `&amp;lt;` yields the literal text `&lt;` instead of being
 * decoded twice. Unknown or invalid references are left untouched.
 *
 * @param {string} html - HTML markup.
 * @returns {string} Plain-text rendering.
 */
function htmlToText(html) {
  // A Map (not an object literal) so names like "constructor" cannot resolve
  // to inherited properties.
  const namedEntities = new Map([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);

  const decodeEntity = (match, body) => {
    if (body[0] !== "#") {
      const named = namedEntities.get(body.toLowerCase());
      return named === undefined ? match : named;
    }
    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
    return isValid ? String.fromCodePoint(codePoint) : match;
  };

  return html
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<br\s*\/?\s*>/gi, "\n")
    .replace(/<\/p>/gi, "\n")
    .replace(/<[^>]+>/g, "")
    // Decode after tag stripping so decoded "<"/">" are not mistaken for tags.
    .replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, decodeEntity)
    .replace(/\r/g, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
||||
|
||||
/**
 * Split a multipart body into its raw parts.
 *
 * Text before the first boundary line (preamble) and after the closing
 * boundary (epilogue) is discarded. Boundary lines may carry trailing
 * whitespace (transport padding). If the closing boundary is missing, whatever
 * was collected for the last part is still returned.
 *
 * Each part is returned with line breaks normalised to "\n" and trailing
 * whitespace removed. Leading whitespace is preserved on purpose: a part that
 * begins with an empty line has no headers, and that empty line is what lets
 * the header/body split recognise it.
 *
 * @param {string} bodyText - Body of the multipart entity.
 * @param {string} boundary - Boundary parameter value, without the leading "--".
 * @returns {string[]} Raw text of each part (headers + body), in order.
 */
function splitMultipartParts(bodyText, boundary) {
  const delimiter = `--${boundary}`;
  const closing = `${delimiter}--`;
  const parts = [];
  let collecting = false;
  let current = [];

  const flush = () => {
    if (current.length) parts.push(current.join("\n").trimEnd());
    current = [];
  };

  for (const line of bodyText.split(/\r?\n/)) {
    const marker = line.trimEnd();
    if (marker === delimiter || marker === closing) {
      flush();
      // Nothing after the closing boundary belongs to a part.
      collecting = marker !== closing;
      continue;
    }
    if (collecting) current.push(line);
  }

  // Unterminated multipart: keep the trailing part rather than dropping it.
  flush();
  return parts;
}
|
||||
|
||||
/**
 * Extract a human-readable text body from a raw email.
 *
 * The message is walked recursively through its MIME structure:
 * - For multipart entities, `text/plain` parts are preferred; if there are
 *   none, `text/html` parts are converted to text. Parts of any other media
 *   type (attachments, images, ...) are not treated as body text.
 * - For single-part messages the body is decoded according to its
 *   Content-Transfer-Encoding and charset; HTML is converted to text.
 *
 * @param {string} raw - Complete raw message (headers + body).
 * @returns {string} Readable body text, or `raw` unchanged when no text could
 *   be extracted.
 */
function extractReadableBody(raw) {
  // Returns { kind, text } where kind is "text/plain", "text/html" or "other".
  const walk = (rawText) => {
    const { headerText, bodyText } = splitEmail(rawText);
    const headers = parseHeaderText(headerText);
    const contentTypeHeader = headers["content-type"] || "";
    // Compare the media type only, so parameter values (e.g. a file name
    // containing "text/html") cannot influence classification.
    const mediaType = (contentTypeHeader.split(";")[0].trim() || "text/plain").toLowerCase();

    if (mediaType.startsWith("multipart/")) {
      const boundary = getHeaderParam(contentTypeHeader, "boundary");
      // Without a boundary the structure cannot be parsed; expose the body as-is.
      if (!boundary) return { kind: "text/plain", text: bodyText.trim() };

      const plainTexts = [];
      const htmlTexts = [];
      for (const part of splitMultipartParts(bodyText, boundary)) {
        const parsed = walk(part);
        if (!parsed.text) continue;
        if (parsed.kind === "text/plain") plainTexts.push(parsed.text);
        else if (parsed.kind === "text/html") htmlTexts.push(parsed.text);
      }

      if (plainTexts.length) {
        return { kind: "text/plain", text: plainTexts.join("\n\n").trim() };
      }
      if (htmlTexts.length) {
        return { kind: "text/plain", text: htmlToText(htmlTexts.join("\n\n")).trim() };
      }
      // No textual parts: "other" keeps an enclosing multipart from mistaking
      // this raw body for text, while the top level still returns it.
      return { kind: "other", text: bodyText.trim() };
    }

    const transferEncoding = headers["content-transfer-encoding"] || "";
    const charset = getHeaderParam(contentTypeHeader, "charset") || "utf-8";
    const decoded = decodePartBody(bodyText, transferEncoding, charset).trim();

    if (mediaType === "text/html") return { kind: "text/html", text: decoded };
    // A malformed media type (no "/") is treated like the text/plain default.
    if (mediaType === "text/plain" || !mediaType.includes("/")) {
      return { kind: "text/plain", text: decoded };
    }
    return { kind: "other", text: decoded };
  };

  const result = walk(raw);
  if (!result || !result.text) return raw;
  return result.kind === "text/html" ? htmlToText(result.text) : result.text;
}
|
||||
|
||||
// Number of milliseconds in one day (24 h x 60 min x 60 s x 1000 ms).
const ONE_DAY_MS = 24 * 60 * 60 * 1000;
|
||||
|
||||
export default {
|
||||
@@ -88,7 +253,8 @@ export default {
|
||||
const subject = getHeader(message.headers, "subject");
|
||||
const id = getHeader(message.headers, "message-id") || crypto.randomUUID();
|
||||
const nexthop = recipient.includes("@") ? recipient.split("@")[1] : "";
|
||||
const content = await streamToString(message.raw);
|
||||
const raw = await streamToString(message.raw);
|
||||
const content = extractReadableBody(raw) || raw;
|
||||
const received_at = Date.now();
|
||||
|
||||
await env.DB.prepare(
|
||||
|
||||
Reference in New Issue
Block a user