feat: parse raw email to readable text body
This commit is contained in:
168
src/index.js
168
src/index.js
@@ -35,6 +35,171 @@ function getHeader(headers, name) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Splits a raw message (or MIME part) into its header block and its body.
 *
 * The header block ends at the first empty line. Both CRLF and bare LF line
 * endings are accepted.
 *
 * @param {string} raw - Raw message or MIME part text.
 * @returns {{headerText: string, bodyText: string}} The header block without
 *   the separating empty line, and everything after that separator. When no
 *   separator exists the whole input is returned as the body.
 */
function splitEmail(raw) {
  // An input that begins with a line break has an empty header section; the
  // body starts right after that break. Without this check the first body
  // paragraph would be mistaken for the header block.
  const leadingBreak = /^\r?\n/.exec(raw);
  if (leadingBreak) {
    return { headerText: "", bodyText: raw.slice(leadingBreak[0].length) };
  }

  const separator = /\r?\n\r?\n/.exec(raw);
  if (!separator) return { headerText: "", bodyText: raw };

  return {
    headerText: raw.slice(0, separator.index),
    bodyText: raw.slice(separator.index + separator[0].length),
  };
}
|
||||||
|
|
||||||
|
/**
 * Parses a raw header block into a map of lower-cased field names to values.
 *
 * - Continuation lines (starting with a space or tab) are unfolded onto the
 *   preceding field, joined by a single space.
 * - Repeated fields are merged into one value separated by ", ".
 * - Lines without a colon, or with an empty field name, are ignored.
 *
 * @param {string} headerText - Header block without the trailing empty line.
 * @returns {Object<string, string>} Prototype-less object keyed by lower-cased
 *   field name.
 */
function parseHeaderText(headerText) {
  // No prototype: a field literally named "constructor" or "toString" must not
  // be confused with an inherited Object.prototype member.
  const headers = Object.create(null);
  let current = "";

  for (const line of headerText.split(/\r?\n/)) {
    // Folded continuation of the most recent field.
    if (current && /^[ \t]/.test(line)) {
      headers[current] += ` ${line.trim()}`;
      continue;
    }

    const colon = line.indexOf(":");
    if (colon <= 0) continue;

    const key = line.slice(0, colon).trim().toLowerCase();
    if (!key) continue;

    const value = line.slice(colon + 1).trim();
    current = key;
    headers[key] = headers[key] ? `${headers[key]}, ${value}` : value;
  }

  return headers;
}
|
||||||
|
|
||||||
|
/**
 * Extracts a parameter (e.g. `charset`, `boundary`) from a structured header
 * value such as `text/plain; charset="utf-8"`.
 *
 * Matching is case-insensitive and works for both quoted and unquoted values.
 * The parameter name must start at the beginning of the value or directly
 * after a ";" or whitespace, so asking for `name` does not match `filename`.
 *
 * @param {string} headerValue - Full header value; may be empty.
 * @param {string} paramName - Parameter name to look up.
 * @returns {string} The parameter value without surrounding quotes, or "" when
 *   the parameter is absent.
 */
function getHeaderParam(headerValue, paramName) {
  if (!headerValue) return "";

  // Escape regex metacharacters so the name is matched literally.
  const escaped = paramName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(
    `(?:^|[;\\s])${escaped}\\s*=\\s*(?:"([^"]*)"|([^;\\s"]+))`,
    "i"
  );

  const found = pattern.exec(headerValue);
  if (!found) return "";

  // Group 1 is the quoted form, group 2 the bare token form.
  return found[1] ?? found[2];
}
|
||||||
|
|
||||||
|
/**
 * Decodes bytes into a string using the given charset label.
 *
 * @param {Uint8Array} bytes - Bytes to decode.
 * @param {string} [charset="utf-8"] - Encoding label understood by TextDecoder.
 * @returns {string} The decoded text. An unsupported charset label falls back
 *   to UTF-8.
 */
function bytesToText(bytes, charset = "utf-8") {
  let decoder;
  try {
    decoder = new TextDecoder(charset);
  } catch {
    // The constructor rejects unknown labels; UTF-8 is the best-effort default.
    decoder = new TextDecoder("utf-8");
  }
  return decoder.decode(bytes);
}
|
||||||
|
|
||||||
|
/**
 * Decodes a base64 transfer-encoded body into bytes.
 *
 * The decoder is lenient so that one stray character does not abort the whole
 * parse: characters outside the base64 alphabet (line breaks, whitespace,
 * anything else) are ignored, the first "=" is treated as the end of the data,
 * and missing padding is tolerated.
 *
 * @param {string} input - Base64 text, possibly wrapped across lines.
 * @returns {Uint8Array} The decoded bytes; empty when there is no data.
 */
function base64ToBytes(input) {
  const alphabetOnly = input.replace(/[^A-Za-z0-9+/=]/g, "");
  // Padding only occurs at the end of the data, so everything after the first
  // "=" is discarded.
  const [payload] = alphabetOnly.split("=");
  // A single leftover character carries fewer than 8 bits and cannot form a byte.
  const usable = payload.length % 4 === 1 ? payload.slice(0, -1) : payload;
  if (!usable) return new Uint8Array();

  const padded = usable.padEnd(Math.ceil(usable.length / 4) * 4, "=");
  const binary = atob(padded);
  const out = new Uint8Array(binary.length);
  for (let i = 0; i < binary.length; i++) out[i] = binary.charCodeAt(i);
  return out;
}
|
||||||
|
|
||||||
|
/**
 * Decodes a quoted-printable transfer-encoded body into bytes.
 *
 * Soft line breaks ("=" at the end of a line, optionally followed by spaces or
 * tabs) are removed, and every "=XX" hex escape becomes the byte 0xXX. An "="
 * that is not followed by two hex digits is kept literally.
 *
 * @param {string} input - Quoted-printable text.
 * @returns {Uint8Array} The decoded bytes.
 */
function quotedPrintableToBytes(input) {
  const unfolded = input.replace(/=[ \t]*\r?\n/g, "");

  // Decoding never produces more bytes than there are input characters.
  const buffer = new Uint8Array(unfolded.length);
  let length = 0;

  for (let i = 0; i < unfolded.length; i++) {
    const hex = unfolded[i] === "=" ? unfolded.slice(i + 1, i + 3) : "";
    if (/^[0-9A-Fa-f]{2}$/.test(hex)) {
      buffer[length++] = Number.parseInt(hex, 16);
      i += 2; // skip the two hex digits just consumed
    } else {
      // Literal characters are taken as single bytes (low 8 bits).
      buffer[length++] = unfolded.charCodeAt(i) & 0xff;
    }
  }

  return buffer.slice(0, length);
}
|
||||||
|
|
||||||
|
/**
 * Decodes the body of a single (non-multipart) MIME part into text.
 *
 * The Content-Transfer-Encoding selects the byte decoder (base64 or
 * quoted-printable); the resulting bytes are then decoded with the charset.
 * Any other encoding is treated as unencoded: each character is taken as one
 * raw byte and the bytes are decoded with the charset.
 *
 * @param {string} bodyText - Part body as found in the raw message.
 * @param {string} transferEncoding - Content-Transfer-Encoding value; may be
 *   empty.
 * @param {string} charset - Charset label for the final text decoding.
 * @returns {string} The decoded text.
 */
function decodePartBody(bodyText, transferEncoding, charset) {
  const encoding = (transferEncoding || "").toLowerCase();

  if (encoding.includes("base64")) {
    return bytesToText(base64ToBytes(bodyText), charset);
  }

  if (encoding.includes("quoted-printable")) {
    return bytesToText(quotedPrintableToBytes(bodyText), charset);
  }

  const bytes = new Uint8Array(bodyText.length);
  for (let i = 0; i < bodyText.length; i++) {
    const code = bodyText.charCodeAt(i);
    // A code unit above 0xFF cannot be a raw byte, so the body is already
    // decoded text; masking it to 8 bits would corrupt it. Return it as-is.
    // NOTE(review): code units 0x80-0xFF are still treated as raw bytes, which
    // assumes the raw message was read as a byte string - confirm against how
    // the caller produces the raw text.
    if (code > 0xff) return bodyText;
    bytes[i] = code;
  }
  return bytesToText(bytes, charset);
}
|
||||||
|
|
||||||
|
/**
 * Converts an HTML fragment into readable plain text.
 *
 * Steps, in order: drop style and script elements with their content, turn
 * line-break tags and closing paragraph tags into newlines, strip all other
 * tags, decode character references, remove carriage returns, collapse runs
 * of three or more newlines into two, and trim.
 *
 * Character references are decoded in a single pass, so text that was escaped
 * twice is unescaped only once. Supported named references: nbsp (decoded to
 * a regular space), amp, lt, gt, quot and apos; decimal and hexadecimal
 * numeric references are decoded as well. Unknown references are left
 * untouched.
 *
 * @param {string} html - HTML source.
 * @returns {string} Plain-text rendering.
 */
function htmlToText(html) {
  const namedEntities = new Map([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);

  const decodeEntity = (entity, body) => {
    if (body[0] === "#") {
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      const isValid = codePoint > 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : entity;
    }
    return namedEntities.get(body.toLowerCase()) ?? entity;
  };

  return html
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<br\s*\/?\s*>/gi, "\n")
    .replace(/<\/p>/gi, "\n")
    .replace(/<[^>]+>/g, "")
    // Tags are stripped before decoding so escaped markup stays visible text.
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, decodeEntity)
    .replace(/\r/g, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
||||||
|
|
||||||
|
/**
 * Splits a multipart body into the raw text of its parts.
 *
 * Lines equal to `--boundary` start a new part and `--boundary--` ends the
 * multipart body; trailing whitespace on those lines is ignored. Text before
 * the first delimiter and after the closing delimiter is discarded. A body
 * that lacks the closing delimiter still yields its last part.
 *
 * @param {string} bodyText - Body of the multipart entity.
 * @param {string} boundary - Boundary parameter value, without leading dashes.
 * @returns {string[]} Parts (headers plus body) with lines joined by "\n" and
 *   surrounding whitespace trimmed.
 */
function splitMultipartParts(bodyText, boundary) {
  const delimiter = `--${boundary}`;
  const closing = `${delimiter}--`;
  const parts = [];
  // Lines of the part being read; null while outside any part.
  let current = null;

  const flush = () => {
    if (current && current.length) parts.push(current.join("\n").trim());
  };

  for (const line of bodyText.split(/\r?\n/)) {
    // Delimiter lines may carry trailing padding whitespace.
    const marker = line.trimEnd();
    if (marker === delimiter || marker === closing) {
      flush();
      current = marker === closing ? null : [];
      continue;
    }
    if (current) current.push(line);
  }

  // Keep the last part of a body whose closing delimiter is missing.
  flush();
  return parts;
}
|
||||||
|
|
||||||
|
/**
 * Extracts a human-readable text body from a raw MIME message.
 *
 * The message is walked recursively. For multipart entities the text/plain
 * parts are preferred; when there are none, the text/html parts are converted
 * to text. Parts of any other media type (attachments, images, ...) that sit
 * inside a multipart entity are ignored. A single-part message is decoded
 * according to its transfer encoding and charset, and converted from HTML
 * when its type is text/html.
 *
 * @param {string} raw - Complete raw message (headers and body).
 * @returns {string} The readable text, or `raw` unchanged when no text could
 *   be extracted.
 */
function extractReadableBody(raw) {
  const plain = (text) => ({ kind: "text/plain", text });

  // Returns { kind, text } for one entity. `kind` is "text/plain", "text/html",
  // or the entity's own media type for non-text content.
  const walk = (rawText, isNested = false) => {
    const { headerText, bodyText } = splitEmail(rawText);
    const headers = parseHeaderText(headerText);
    const contentTypeHeader = headers["content-type"] || "";
    const transferEncoding = headers["content-transfer-encoding"] || "";
    const charset = getHeaderParam(contentTypeHeader, "charset") || "utf-8";

    // Only the "type/subtype" token decides the media type; parameters such as
    // a file name must not influence it. Missing or malformed types are
    // treated as text/plain.
    const declaredType = contentTypeHeader.split(";")[0].trim().toLowerCase();
    const mediaType = declaredType.includes("/") ? declaredType : "text/plain";

    if (mediaType.startsWith("multipart/")) {
      const boundary = getHeaderParam(contentTypeHeader, "boundary");
      // Same result shape as every other branch, so callers can rely on .text.
      if (!boundary) return plain(bodyText.trim());

      const plainTexts = [];
      const htmlTexts = [];
      for (const part of splitMultipartParts(bodyText, boundary)) {
        const parsed = walk(part, true);
        if (!parsed.text) continue;
        if (parsed.kind === "text/plain") plainTexts.push(parsed.text);
        else if (parsed.kind === "text/html") htmlTexts.push(parsed.text);
      }

      if (plainTexts.length) return plain(plainTexts.join("\n\n").trim());
      if (htmlTexts.length) return plain(htmlToText(htmlTexts.join("\n\n")).trim());
      // No text parts: a nested container contributes nothing; at top level the
      // undecoded body is the last resort.
      return plain(isNested ? "" : bodyText.trim());
    }

    const isText = mediaType.startsWith("text/");
    // Non-text parts inside a multipart entity are never used, so they are not
    // decoded at all.
    if (!isText && isNested) return { kind: mediaType, text: "" };

    const decoded = decodePartBody(bodyText, transferEncoding, charset).trim();
    if (mediaType === "text/html") return { kind: "text/html", text: decoded };
    return { kind: isText ? "text/plain" : mediaType, text: decoded };
  };

  const result = walk(raw);
  if (!result || !result.text) return raw;
  return result.kind === "text/html" ? htmlToText(result.text) : result.text;
}
|
||||||
|
|
||||||
// Number of milliseconds in one day.
const ONE_DAY_MS = 24 * 60 * 60 * 1000;
|
||||||
|
|
||||||
export default {
|
export default {
|
||||||
@@ -88,7 +253,8 @@ export default {
|
|||||||
const subject = getHeader(message.headers, "subject");
|
const subject = getHeader(message.headers, "subject");
|
||||||
const id = getHeader(message.headers, "message-id") || crypto.randomUUID();
|
const id = getHeader(message.headers, "message-id") || crypto.randomUUID();
|
||||||
const nexthop = recipient.includes("@") ? recipient.split("@")[1] : "";
|
const nexthop = recipient.includes("@") ? recipient.split("@")[1] : "";
|
||||||
const content = await streamToString(message.raw);
|
const raw = await streamToString(message.raw);
|
||||||
|
const content = extractReadableBody(raw) || raw;
|
||||||
const received_at = Date.now();
|
const received_at = Date.now();
|
||||||
|
|
||||||
await env.DB.prepare(
|
await env.DB.prepare(
|
||||||
|
|||||||
Reference in New Issue
Block a user