langchain-ai · hwchase17 · May 31, 2023 · May 27, 2023 · May 29, 2023 · May 31, 2023
diff --git a/docs/modules/indexes/document_loaders/examples/sitemap.ipynb b/docs/modules/indexes/document_loaders/examples/sitemap.ipynb
@@ -8,7 +8,7 @@
     "\n",
     "Extends from the `WebBaseLoader`, `SitemapLoader` loads a sitemap from a given URL, and then scrape and load all pages in the sitemap, returning each page as a Document.\n",
     "\n",
-    "The scraping is done concurrently.  There are reasonable limits to concurrent requests, defaulting to 2 per second.  If you aren't concerned about being a good citizen, or you control the scrapped server, or don't care about load, you can change the `requests_per_second` parameter to increase the max concurrent requests.  Note, while this will speed up the scraping process, but it may cause the server to block you.  Be careful!"
+    "The scraping is done concurrently.  There are reasonable limits to concurrent requests, defaulting to 2 per second.  If you aren't concerned about being a good citizen, or you control the scraped server, or don't care about load, you can raise this limit.  Note that while this will speed up the scraping process, it may cause the server to block you.  Be careful!"
    ]
   },
   {
@@ -63,6 +63,25 @@
     "docs = sitemap_loader.load()"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can change the `requests_per_second` parameter to increase the maximum number of concurrent requests, and use `requests_kwargs` to pass extra keyword arguments when sending requests."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sitemap_loader.requests_per_second = 2\n",
+    "# Optional: avoid `[SSL: CERTIFICATE_VERIFY_FAILED]` errors by disabling TLS certificate verification (insecure; use only with hosts you trust)\n",
+    "sitemap_loader.requests_kwargs = {\"verify\": False}"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,

diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py
@@ -2,7 +2,7 @@
 import asyncio
 import logging
 import warnings
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import aiohttp
 import requests
@@ -47,6 +47,9 @@ class WebBaseLoader(BaseLoader):
     default_parser: str = "html.parser"
     """Default parser to use for BeautifulSoup."""
 
+    requests_kwargs: Dict[str, Any] = {}
+    """kwargs for requests"""
+
     def __init__(
         self, web_path: Union[str, List[str]], header_template: Optional[dict] = None
     ):
@@ -170,7 +173,7 @@ def _scrape(self, url: str, parser: Union[str, None] = None) -> Any:
 
         self._check_parser(parser)
 
-        html_doc = self.session.get(url)
+        html_doc = self.session.get(url, **self.requests_kwargs)
         html_doc.encoding = html_doc.apparent_encoding
         return BeautifulSoup(html_doc.text, parser)