Skip to content

Commit

Permalink
community[minor]: add document transformer for extracting links (#24186)
Browse files Browse the repository at this point in the history
- **Description:** Add a DocumentTransformer for executing one or more
`LinkExtractor`s and adding the extracted links to each document.
- **Issue:** n/a
- **Dependencies:** none

---------

Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
  • Loading branch information
bjchambers and eyurtsev committed Jul 23, 2024
1 parent 3c4652c commit 5ac936a
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,24 @@
HierarchyInput,
HierarchyLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
KeybertInput,
KeybertLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (

from .html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from .link_extractor import (
LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
from .link_extractor_adapter import (
LinkExtractorAdapter,
)
from .link_extractor_transformer import (
LinkExtractorTransformer,
)

__all__ = [
"GLiNERInput",
Expand All @@ -34,4 +38,5 @@
"LinkExtractor",
"LinkExtractorAdapter",
"LinkExtractorAdapter",
"LinkExtractorTransformer",
]
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
"""Interface for extracting links (incoming, outgoing, bidirectional)."""

@abstractmethod
def extract_one(self, input: InputT) -> set[Link]: # noqa: A002
def extract_one(self, input: InputT) -> Set[Link]:
"""Add edges from each `input` to the corresponding documents.
Args:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import Any, Iterable, Sequence

from langchain_core.documents import Document
from langchain_core.documents.transformers import BaseDocumentTransformer
from langchain_core.graph_vectorstores.links import copy_with_links

from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)


class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Example:
        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document.

        Args:
            link_extractors: The link extractors to apply to each document.
        """
        # Materialize the extractors so the transformer can be reused across
        # multiple calls even when given a single-pass iterable (e.g. a generator).
        self.link_extractors = list(link_extractors)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Return copies of `documents` with all extracted links added.

        Implements `transform_documents` directly (rather than per-document),
        so that LinkExtractors which operate better in batch (`extract_many`)
        get a chance to do so.

        Args:
            documents: The documents to extract links from.

        Returns:
            Documents with a shallow-copied metadata containing the links.
        """
        if not self.link_extractors:
            # With no extractors, zip(*[]) below would produce an empty
            # iterator and silently drop every document. Return the input
            # unchanged instead.
            return documents

        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        # Transpose the list of lists to pair each document with the tuple of links.
        links_per_document = zip(*links_per_extractor)

        return [
            copy_with_links(document, *links)
            for document, links in zip(documents, links_per_document)
        ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from typing import Set

from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link, get_links

from langchain_community.graph_vectorstores.extractors import (
LinkExtractor,
LinkExtractorTransformer,
)

TEXT1 = "Text1"
TEXT2 = "Text2"


class FakeKeywordExtractor(LinkExtractor[Document]):
    """Test double that emits hard-coded bidirectional keyword links."""

    def extract_one(self, input: Document) -> Set[Link]:
        # Map each known text to its keyword set; anything else yields no links.
        keywords: Set[str]
        if input.page_content == TEXT1:
            keywords = {"a", "b"}
        elif input.page_content == TEXT2:
            keywords = {"b", "c"}
        else:
            keywords = set()
        return {Link.bidir(kind="fakekw", tag=keyword) for keyword in keywords}


class FakeHyperlinkExtractor(LinkExtractor[Document]):
    """Test double that emits hard-coded hyperlink links, raising on unknown text."""

    def extract_one(self, input: Document) -> Set[Link]:
        content = input.page_content
        if content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        if content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        raise ValueError(
            f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
        )


def test_one_extractor() -> None:
    """A single extractor's links are attached to each transformed document."""
    transformer = LinkExtractorTransformer([FakeKeywordExtractor()])
    documents = [Document(TEXT1), Document(TEXT2)]

    transformed = transformer.transform_documents(documents)

    expected_links = [
        {
            Link.bidir(kind="fakekw", tag="a"),
            Link.bidir(kind="fakekw", tag="b"),
        },
        {
            Link.bidir(kind="fakekw", tag="b"),
            Link.bidir(kind="fakekw", tag="c"),
        },
    ]
    for document, links in zip(transformed, expected_links):
        assert set(get_links(document)) == links


def test_multiple_extractors() -> None:
    """Links from every extractor are merged onto each transformed document."""
    transformer = LinkExtractorTransformer(
        [
            FakeKeywordExtractor(),
            FakeHyperlinkExtractor(),
        ]
    )
    documents = [Document(TEXT1), Document(TEXT2)]

    transformed = transformer.transform_documents(documents)

    expected_links = [
        {
            Link.bidir(kind="fakekw", tag="a"),
            Link.bidir(kind="fakekw", tag="b"),
            Link.incoming(kind="fakehref", tag="http://text1"),
            Link.outgoing(kind="fakehref", tag="http://text2"),
            Link.outgoing(kind="fakehref", tag="http://text3"),
        },
        {
            Link.bidir(kind="fakekw", tag="b"),
            Link.bidir(kind="fakekw", tag="c"),
            Link.incoming(kind="fakehref", tag="http://text2"),
            Link.outgoing(kind="fakehref", tag="http://text3"),
        },
    ]
    for document, links in zip(transformed, expected_links):
        assert set(get_links(document)) == links
28 changes: 27 additions & 1 deletion libs/core/langchain_core/graph_vectorstores/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class Link:
"""

kind: str
"""The kind of link. Allows different extractors to use the same tag name without
"""The kind of link. Allows different extractors to use the same tag name without
creating collisions between extractors. For example “keyword” vs “url”."""
direction: Literal["in", "out", "bidir"]
"""The direction of the link."""
Expand Down Expand Up @@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
links_in_metadata.extend(link)
else:
links_in_metadata.append(link)


def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
    """Return a document with the given links added.

    Args:
        doc: The document to add the links to.
        *links: The links to add to the document; each argument may be a single
            Link or an iterable of Links.

    Returns:
        A document with a shallow-copy of the metadata with the links added.
    """
    # Start from the document's existing links so nothing is dropped,
    # deduplicating via the set.
    combined = set(get_links(doc))
    for entry in links:
        if isinstance(entry, Iterable):
            combined.update(entry)
        else:
            combined.add(entry)

    # Shallow-copy the metadata; the original document is left untouched.
    metadata = dict(doc.metadata)
    metadata[METADATA_LINKS_KEY] = list(combined)
    return Document(page_content=doc.page_content, metadata=metadata)

0 comments on commit 5ac936a

Please sign in to comment.