diff --git a/api/parse.ts b/api/parse.ts
deleted file mode 100644
index 7315212..0000000
--- a/api/parse.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-import { Readability } from '@mozilla/readability';
-import type { VercelRequest, VercelResponse } from '@vercel/node';
-import { JSDOM } from 'jsdom';
-import { NodeHtmlMarkdown } from 'node-html-markdown';
-
-export default async function handler(req: VercelRequest, res: VercelResponse) {
-  const { html, url } = req.body;
-  const doc = new JSDOM(html, { url });
-
-  const article = new Readability(doc.window.document).parse();
-
-  const content = NodeHtmlMarkdown.translate(article?.content || '', {});
-
-  const body = { ...article, content };
-
-  res.send(body);
-
-  return body;
-}
diff --git a/api/v1/_utils.ts b/api/v1/_utils.ts
new file mode 100644
index 0000000..addf54e
--- /dev/null
+++ b/api/v1/_utils.ts
@@ -0,0 +1,68 @@
+import { Readability } from '@mozilla/readability';
+import { JSDOM } from 'jsdom';
+import { NodeHtmlMarkdown } from 'node-html-markdown';
+
+const BASE_URL = process.env.BROWSERLESS_URL ?? 'https://chrome.browserless.io';
+const BROWSERLESS_TOKEN = process.env.BROWSERLESS_TOKEN;
+
+/**
+ * Extracts the readable article from an HTML document and converts its body to Markdown.
+ *
+ * @param html - Raw HTML of the page.
+ * @param url - Passed to JSDOM as the document URL.
+ * @returns The fields produced by Readability with `content` replaced by its Markdown form;
+ *   when Readability yields no article, the Markdown is generated from an empty string.
+ */
+export const htmlToMarkdown = (html: string, url: string) => {
+  const doc = new JSDOM(html, { url });
+
+  const article = new Readability(doc.window.document).parse();
+  const content = NodeHtmlMarkdown.translate(article?.content || '', {});
+
+  return { ...article, content };
+};
+
+/**
+ * Requests the page HTML from the Browserless `/content` endpoint and converts it to Markdown.
+ *
+ * Errors raised while fetching or converting are logged and returned as a failure payload
+ * (with `errorMessage`) rather than thrown.
+ *
+ * @param params - Object carrying the `url` of the page to crawl.
+ * @returns The Markdown content, title and site name of the page, or the failure payload.
+ */
+const runner = async ({ url }: { url: string }) => {
+  const input = {
+    gotoOptions: { waitUntil: 'networkidle2' },
+    url,
+  };
+
+  try {
+    const res = await fetch(`${BASE_URL}/content?token=${BROWSERLESS_TOKEN}`, {
+      body: JSON.stringify(input),
+      headers: {
+        'Content-Type': 'application/json',
+      },
+      method: 'POST',
+    });
+
+    // A non-2xx reply carries an error body, not the crawled page; do not convert it as content.
+    if (!res.ok) {
+      throw new Error(`Browserless request failed: ${res.status} ${res.statusText}`);
+    }
+
+    const html = await res.text();
+
+    const article = htmlToMarkdown(html, url);
+
+    return { content: article.content, title: article?.title, url, website: article?.siteName };
+  } catch (error) {
+    console.error(error);
+    // The caught value is not guaranteed to be an Error; narrow it instead of casting to `any`.
+    const errorMessage = error instanceof Error ? error.message : String(error);
+
+    return { content: '抓取失败', errorMessage, url };
+  }
+};
+
+export default runner;
diff --git a/api/v1/index.ts b/api/v1/index.ts
new file mode 100644
index 0000000..a27bc4b
--- /dev/null
+++ b/api/v1/index.ts
@@ -0,0 +1,21 @@
+import type { VercelRequest, VercelResponse } from '@vercel/node';
+
+import fetchContent from './_utils';
+
+/**
+ * Crawl endpoint: passes the request body to the crawler and sends back its result.
+ *
+ * Only POST is accepted; any other method is answered with 405 and nothing is crawled.
+ */
+export default async function handler(req: VercelRequest, res: VercelResponse) {
+  if (req.method !== 'POST') {
+    // Stop here: setting the status alone would still fall through to the crawl below.
+    res.setHeader('Allow', 'POST');
+    res.status(405).send('Method Not Allowed');
+    return;
+  }
+
+  const result = await fetchContent(req.body);
+
+  res.send(result);
+}
diff --git a/api/v1/type.ts b/api/v1/type.ts
new file mode 100644
index 0000000..946fb61
--- /dev/null
+++ b/api/v1/type.ts
@@ -0,0 +1,35 @@
+export type Result = {
+  content: string;
+  title?: string;
+  url: string;
+  website?: string;
+};
+
+export interface ParserResponse {
+  /** author metadata */
+  byline: string;
+
+  /** HTML string of processed article content */
+  content: string;
+
+  /** content direction */
+  dir: string;
+
+  /** article description, or short excerpt from the content */
+  excerpt: string;
+
+  /** content language */
+  lang: string;
+
+  /** length of an article, in characters */
+  length: number;
+
+  /** name of the site */
+  siteName: string;
+
+  /** text content of the article, with all the HTML tags removed */
+  textContent: string;
+
+  /** article title */
+  title: string;
+}
diff --git a/package.json b/package.json
index 5753664..afad0b4 100644
--- a/package.json
+++ b/package.json
@@ -1,9 +1,9 @@
 {
-  "name": "lobehub-html-parser",
+  "name": "@lobehub/chat-plugin-web-crawler",
   "version": "1.0.1",
   "private": true,
-  "description": "HTML 转 markdown 服务",
-  "repository": "https://github.com/arvinxx/vercel-serverless-api-template.git",
+  "description": "Lobe Chat 网页抓取服务",
+  "repository": "https://github.com/lobehub/chat-plugin-web-crawler.git",
   "scripts": {
     "ci": "npm run lint && npm run type-check",
     "lint": "npm run lint:js && npm run lint:prettier",
diff --git a/tests/parse.test.ts b/tests/parse.test.ts
index cf43596..44411ab 100644
--- a/tests/parse.test.ts
+++ b/tests/parse.test.ts
@@ -1,25 +1,14 @@
-import { VercelRequest, VercelResponse } from '@vercel/node';
 import { readFileSync } from 'node:fs';
 import * as path from 'node:path';
 import { expect } from 'vitest';
 
-import Api from '../api/parse';
+import { htmlToMarkdown } from '../api/v1/_utils';
 
 describe('html-to-markdown', () => {
-  it('Zhihu', async () => {
+  it('Zhihu', () => {
     const html = readFileSync(path.join(__dirname, './html/zhihu.html'), { encoding: 'utf8' });
 
-    const data = await Api(
-      ({
-        body: {
-          html,
-          url: 'https://zhuanlan.zhihu.com/p/641434725',
-        },
-      }),
-      ({
-        send: () => {},
-      }),
-    );
+    const data = htmlToMarkdown(html, 'https://zhuanlan.zhihu.com/p/641434725');
 
     expect(data).toMatchSnapshot();
   });