feat: 引入 pdf.js,支持 PDF 文本提取内联

This commit is contained in:
zimk
2026-03-17 00:00:16 +08:00
parent c6f2c99a02
commit 9bb830c277
2 changed files with 25 additions and 1 deletions

View File

@@ -364,6 +364,23 @@ async function addPendingFiles(files) {
} catch (err) { } catch (err) {
console.warn('pptx 解析失败', err); console.warn('pptx 解析失败', err);
} }
} else if (kind === 'pdf') {
try {
const arrayBuffer = await readFileAsArrayBuffer(file);
const pdfjsLib = window['pdfjs-dist/build/pdf'];
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.worker.min.js';
const pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise;
const pageTexts = [];
for (let i = 1; i <= pdf.numPages; i++) {
const page = await pdf.getPage(i);
const textContent = await page.getTextContent();
const pageText = textContent.items.map((item) => item.str).join(' ');
if (pageText.trim()) pageTexts.push(`[第 ${i} 页]\n${pageText}`);
}
parsedText = pageTexts.join('\n\n') || 'PDF 文本提取为空,可能是扫描件)';
} catch (err) {
console.warn('pdf.js 解析失败', err);
}
} }
const dataUrl = await readFileAsDataURL(file); const dataUrl = await readFileAsDataURL(file);
@@ -539,8 +556,14 @@ function buildUserMessage(message, attachments) {
type: 'text', type: 'text',
text: `文件名:${file.name}\n内容:\n\`\`\`\n${decoded}\n\`\`\``, text: `文件名:${file.name}\n内容:\n\`\`\`\n${decoded}\n\`\`\``,
}); });
} else if ((file.kind === 'docx' || file.kind === 'xlsx' || file.kind === 'pptx' || file.kind === 'pdf') && file.parsedText != null) {
// Office 文档 / PDF 使用前端解析出的文本内联
content.push({
type: 'text',
text: `文件名:${file.name}\n内容:\n\`\`\`\n${file.parsedText}\n\`\`\``,
});
} else { } else {
// PDF / 其他二进制:记录下来,后面统一追加描述 // 其他二进制:记录下来,后面统一追加描述
nonInlineable.push(file); nonInlineable.push(file);
} }
}); });

View File

@@ -91,6 +91,7 @@
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script> <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/mammoth@1.8.0/mammoth.browser.min.js"></script> <script src="https://cdn.jsdelivr.net/npm/mammoth@1.8.0/mammoth.browser.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/xlsx@0.18.5/dist/xlsx.full.min.js"></script> <script src="https://cdn.jsdelivr.net/npm/xlsx@0.18.5/dist/xlsx.full.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.min.js"></script>
<script src="/config.js"></script> <script src="/config.js"></script>
<script src="/app.js" defer></script> <script src="/app.js" defer></script>
</body> </body>