Merge pull request #336 from snippet/allow-external-content-links
[Proposal] new feature allowExternalContentLinks
rafaelsideguide authored Jul 2, 2024
2 parents 0821017 + db4a743 commit f0f449f
Showing 4 changed files with 110 additions and 16 deletions.
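
For context, a minimal sketch of how a client might enable the new option, based on the request shape used in the e2e test below (the base URL and API key are placeholders, not part of this commit):

// Hypothetical client call; the endpoint path and option names come from the e2e test in this diff.
const response = await fetch("https://api.example.com/v0/crawl", { // base URL is a placeholder
  method: "POST",
  headers: {
    "Authorization": `Bearer ${process.env.API_KEY}`, // placeholder credential
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    url: "https://mendable.ai",
    crawlerOptions: {
      allowExternalContentLinks: true, // new flag: follow links that leave the start domain
      limit: 50,
    },
  }),
});
const { jobId } = await response.json(); // then poll /v0/crawl/status/{jobId} until completed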
40 changes: 40 additions & 0 deletions apps/api/src/__tests__/e2e_full_withAuth/index.test.ts
@@ -826,6 +826,46 @@ describe("E2E Tests for API Routes", () => {
expect(completedResponse.body.data[0].metadata.pageError).toBeUndefined();
}, 180000);

it.concurrent("should crawl external content links when allowed", async () => {
const crawlInitResponse = await request(TEST_URL)
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://mendable.ai",
crawlerOptions: {
allowExternalContentLinks: true,
ignoreSitemap: true,
returnOnlyUrls: true,
limit: 50
}
});

expect(crawlInitResponse.statusCode).toBe(200);
expect(crawlInitResponse.body).toHaveProperty("jobId");

let crawlStatus: string = "";
let crawlData = [];
while (crawlStatus !== "completed") {
const statusResponse = await request(TEST_URL)
.get(`/v0/crawl/status/${crawlInitResponse.body.jobId}`)
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
crawlStatus = statusResponse.body.status;
if (statusResponse.body.data) {
crawlData = statusResponse.body.data;
}
if (crawlStatus !== "completed") {
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
}
}
expect(crawlData.length).toBeGreaterThan(0);
expect(crawlData).toEqual(expect.arrayContaining([
expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }),
expect.objectContaining({ url: expect.stringContaining("https://mendable.ai/pricing") }),
expect.objectContaining({ url: expect.stringContaining("https://x.com/CalebPeffer") })
]));
}, 180000); // 3 minutes timeout
});
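
The status-polling loop above is the general pattern for consuming this API; here is a sketch of the same loop factored into a helper, under the same supertest setup as the test (the helper is illustrative and not part of this commit):

// Illustrative helper mirroring the polling loop in the test above.
async function waitForCrawl(jobId: string, intervalMs = 1000): Promise<any[]> {
  while (true) {
    const statusResponse = await request(TEST_URL)
      .get(`/v0/crawl/status/${jobId}`)
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
    if (statusResponse.body.status === "completed") {
      return statusResponse.body.data ?? [];
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs)); // wait before re-checking
  }
}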

describe("POST /v0/crawlWebsitePreview", () => {
1 change: 1 addition & 0 deletions apps/api/src/lib/entities.ts
@@ -52,6 +52,7 @@ export type CrawlerOptions = {
ignoreSitemap?: boolean;
mode?: "default" | "fast"; // have a mode of some sort
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
}

export type WebScraperOptions = {
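
For reference, a CrawlerOptions value exercising the new field might look like the following; only fields visible in this diff are used, and the values are illustrative:

// Illustrative CrawlerOptions value; the new field defaults to false when omitted.
const options: CrawlerOptions = {
  ignoreSitemap: true,
  mode: "default",
  allowBackwardCrawling: false,
  allowExternalContentLinks: true, // added by this PR
};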
82 changes: 66 additions & 16 deletions apps/api/src/scraper/WebScraper/crawler.ts
@@ -23,6 +23,7 @@ export class WebCrawler {
private robots: any;
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;

constructor({
initialUrl,
@@ -32,7 +33,8 @@
limit = 10000,
generateImgAltText = false,
maxCrawledDepth = 10,
allowBackwardCrawling = false
allowBackwardCrawling = false,
allowExternalContentLinks = false
}: {
initialUrl: string;
includes?: string[];
@@ -42,6 +44,7 @@
generateImgAltText?: boolean;
maxCrawledDepth?: number;
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
}) {
this.initialUrl = initialUrl;
this.baseUrl = new URL(initialUrl).origin;
@@ -55,6 +58,7 @@
this.maxCrawledDepth = maxCrawledDepth ?? 10;
this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
}

private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
@@ -98,9 +102,10 @@
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');

// Ensure the protocol and hostname match, and the path starts with the initial URL's path
if (linkHostname !== initialHostname) {
return false;
}
// Commented out to allow handling external links when allowExternalContentLinks is enabled
// if (linkHostname !== initialHostname) {
// return false;
// }

if (!this.allowBackwardCrawling) {
if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
@@ -278,15 +283,24 @@
const path = urlObj.pathname;


if (
this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) &&
// The idea behind commenting this out is to allow wider website coverage, since these links are filtered afterwards anyway
// this.matchesIncludes(path) &&
!this.matchesExcludes(path) &&
this.isRobotsAllowed(fullUrl)
) {
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
  if (
    this.noSections(fullUrl) &&
    !this.matchesExcludes(path) &&
    this.isRobotsAllowed(fullUrl)
  ) {
    links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
  }
} else { // EXTERNAL LINKS
  if (
    this.isInternalLink(url) && // the page being scraped is internal, even though the link is not
    this.allowExternalContentLinks &&
    !this.isSocialMediaOrEmail(fullUrl) &&
    !this.matchesExcludes(fullUrl, true) &&
    !this.isExternalMainPage(fullUrl)
  ) {
    links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
  }
}
}
});
@@ -320,9 +334,41 @@
return this.includes.some((pattern) => new RegExp(pattern).test(url));
}

private matchesExcludes(url: string): boolean {
if (this.excludes.length === 0 || this.excludes[0] == "") return false;
return this.excludes.some((pattern) => new RegExp(pattern).test(url));
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
  if (this.excludes.length === 0 || this.excludes[0] == "") return false;
  if (onlyDomains) return this.matchesExcludesExternalDomains(url);
  return this.excludes.some((pattern) => new RegExp(pattern).test(url));
}

// supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
private matchesExcludesExternalDomains(url: string) {
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const pathname = urlObj.pathname;

for (let domain of this.excludes) {
let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
let domainHostname = domainObj.hostname;
let domainPathname = domainObj.pathname;

if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) {
if (pathname.startsWith(domainPathname)) {
return true;
}
}
}
return false;
} catch (e) {
return false;
}
}

// Treats a URL with no path segments beyond its origin (e.g. "https://example.com/") as an external main page
private isExternalMainPage(url: string): boolean {
  return !Boolean(url.split("/").slice(3).filter(segment => segment.length > 0).length);
}

private noSections(link: string): boolean {
@@ -375,6 +421,10 @@
"instagram.com",
"pinterest.com",
"mailto:",
"github.com",
"calendly.com",
"discord.gg",
"discord.com",
];
return socialMediaOrEmail.some((ext) => url.includes(ext));
}
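
To make the new exclusion rule concrete, here is a standalone sketch of the same matching logic with a few hypothetical inputs (the function re-implements matchesExcludesExternalDomains for illustration; it is not the committed code):

// Standalone sketch of the domain/path exclusion check added above.
function matchesExcludedDomain(url: string, excludes: string[]): boolean {
  try {
    const { hostname, pathname } = new URL(url);
    return excludes.some((domain) => {
      const d = new URL("http://" + domain.replace(/^https?:\/\//, ""));
      const hostMatches = hostname === d.hostname || hostname.endsWith(`.${d.hostname}`);
      return hostMatches && pathname.startsWith(d.pathname);
    });
  } catch {
    return false;
  }
}

matchesExcludedDomain("https://example.com/blog/post-1", ["example.com/blog"]); // true: hostname and path prefix match
matchesExcludedDomain("https://sub.example.com/blog", ["example.com/blog"]);    // true: subdomain matches ".example.com"
matchesExcludedDomain("https://example.com/pricing", ["example.com/blog"]);     // false: path does not start with /blog
matchesExcludedDomain("https://other.org/blog", ["example.com/blog"]);          // false: hostname differs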
3 changes: 3 additions & 0 deletions apps/api/src/scraper/WebScraper/index.ts
@@ -40,6 +40,7 @@ export class WebScraperDataProvider {
"gpt-4-turbo";
private crawlerMode: string = "default";
private allowBackwardCrawling: boolean = false;
private allowExternalContentLinks: boolean = false;

authorize(): void {
throw new Error("Method not implemented.");
@@ -173,6 +174,7 @@
limit: this.limit,
generateImgAltText: this.generateImgAltText,
allowBackwardCrawling: this.allowBackwardCrawling,
allowExternalContentLinks: this.allowExternalContentLinks,
});

let links = await crawler.start(
@@ -496,6 +498,7 @@
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
this.allowBackwardCrawling = options.crawlerOptions?.allowBackwardCrawling ?? false;
this.allowExternalContentLinks = options.crawlerOptions?.allowExternalContentLinks ?? false;

// make sure all urls start with https://
this.urls = this.urls.map((url) => {
