Skip to content

Commit

Permalink
community[minor]: add document transformer for extracting links (#24186)
Browse files Browse the repository at this point in the history
- **Description:** Add a DocumentTransformer for executing one or more
`LinkExtractor`s and adding the extracted links to each document.
- **Issue:** n/a
- **Dependencies:** none

---------

Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
  • Loading branch information
bjchambers and eyurtsev committed Jul 23, 2024
1 parent 3c4652c commit 5ac936a
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,24 @@
HierarchyInput,
HierarchyLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
KeybertInput,
KeybertLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (

from .html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from .link_extractor import (
LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
from .link_extractor_adapter import (
LinkExtractorAdapter,
)
from .link_extractor_transformer import (
LinkExtractorTransformer,
)

__all__ = [
"GLiNERInput",
Expand All @@ -34,4 +38,5 @@
"LinkExtractor",
"LinkExtractorAdapter",
"LinkExtractorAdapter",
"LinkExtractorTransformer",
]
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
"""Interface for extracting links (incoming, outgoing, bidirectional)."""

@abstractmethod
def extract_one(self, input: InputT) -> set[Link]: # noqa: A002
def extract_one(self, input: InputT) -> Set[Link]:
"""Add edges from each `input` to the corresponding documents.
Args:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import Any, Iterable, Sequence

from langchain_core.documents import Document
from langchain_core.documents.transformers import BaseDocumentTransformer
from langchain_core.graph_vectorstores.links import copy_with_links

from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)


class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Example:
        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document.

        Args:
            link_extractors: The link extractors to apply to each document.
        """
        # Materialize the extractors so the transformer can be reused across
        # multiple calls even when given a single-pass iterable (e.g. a generator).
        self.link_extractors = list(link_extractors)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Return copies of `documents` with all extracted links added.

        Implements `transform_documents` directly (rather than per-document),
        so that LinkExtractors which operate better in batch (`extract_many`)
        get a chance to do so.

        Args:
            documents: The documents to extract links from.

        Returns:
            Documents with a shallow-copied metadata containing the links.
        """
        if not self.link_extractors:
            # With no extractors, zip(*[]) below would produce an empty
            # iterator and silently drop every document. Return the input
            # unchanged instead.
            return documents

        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        # Transpose the list of lists to pair each document with the tuple of links.
        links_per_document = zip(*links_per_extractor)

        return [
            copy_with_links(document, *links)
            for document, links in zip(documents, links_per_document)
        ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from typing import Set

from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link, get_links

from langchain_community.graph_vectorstores.extractors import (
LinkExtractor,
LinkExtractorTransformer,
)

TEXT1 = "Text1"
TEXT2 = "Text2"


class FakeKeywordExtractor(LinkExtractor[Document]):
    """Test double that emits hard-coded bidirectional keyword links."""

    def extract_one(self, input: Document) -> Set[Link]:
        # Map each known text to its keyword set; anything else yields no links.
        keywords: Set[str]
        if input.page_content == TEXT1:
            keywords = {"a", "b"}
        elif input.page_content == TEXT2:
            keywords = {"b", "c"}
        else:
            keywords = set()
        return {Link.bidir(kind="fakekw", tag=keyword) for keyword in keywords}


class FakeHyperlinkExtractor(LinkExtractor[Document]):
    """Test double that emits hard-coded hyperlink links, raising on unknown text."""

    def extract_one(self, input: Document) -> Set[Link]:
        content = input.page_content
        if content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        if content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        raise ValueError(
            f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
        )


def test_one_extractor() -> None:
    """A single extractor's links are attached to each transformed document."""
    transformer = LinkExtractorTransformer([FakeKeywordExtractor()])
    documents = [Document(TEXT1), Document(TEXT2)]

    transformed = transformer.transform_documents(documents)

    expected_links = [
        {
            Link.bidir(kind="fakekw", tag="a"),
            Link.bidir(kind="fakekw", tag="b"),
        },
        {
            Link.bidir(kind="fakekw", tag="b"),
            Link.bidir(kind="fakekw", tag="c"),
        },
    ]
    for document, links in zip(transformed, expected_links):
        assert set(get_links(document)) == links


def test_multiple_extractors() -> None:
    """Links from every extractor are merged onto each transformed document."""
    transformer = LinkExtractorTransformer(
        [
            FakeKeywordExtractor(),
            FakeHyperlinkExtractor(),
        ]
    )
    documents = [Document(TEXT1), Document(TEXT2)]

    transformed = transformer.transform_documents(documents)

    expected_links = [
        {
            Link.bidir(kind="fakekw", tag="a"),
            Link.bidir(kind="fakekw", tag="b"),
            Link.incoming(kind="fakehref", tag="http://text1"),
            Link.outgoing(kind="fakehref", tag="http://text2"),
            Link.outgoing(kind="fakehref", tag="http://text3"),
        },
        {
            Link.bidir(kind="fakekw", tag="b"),
            Link.bidir(kind="fakekw", tag="c"),
            Link.incoming(kind="fakehref", tag="http://text2"),
            Link.outgoing(kind="fakehref", tag="http://text3"),
        },
    ]
    for document, links in zip(transformed, expected_links):
        assert set(get_links(document)) == links
28 changes: 27 additions & 1 deletion libs/core/langchain_core/graph_vectorstores/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class Link:
"""

kind: str
"""The kind of link. Allows different extractors to use the same tag name without
"""The kind of link. Allows different extractors to use the same tag name without
creating collisions between extractors. For example “keyword” vs “url”."""
direction: Literal["in", "out", "bidir"]
"""The direction of the link."""
Expand Down Expand Up @@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
links_in_metadata.extend(link)
else:
links_in_metadata.append(link)


def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
    """Return a document with the given links added.

    Args:
        doc: The document to add the links to.
        *links: The links to add to the document; each argument may be a single
            Link or an iterable of Links.

    Returns:
        A document with a shallow-copy of the metadata with the links added.
    """
    # Start from the document's existing links so nothing is dropped,
    # deduplicating via the set.
    combined = set(get_links(doc))
    for entry in links:
        if isinstance(entry, Iterable):
            combined.update(entry)
        else:
            combined.add(entry)

    # Shallow-copy the metadata; the original document is left untouched.
    metadata = dict(doc.metadata)
    metadata[METADATA_LINKS_KEY] = list(combined)
    return Document(page_content=doc.page_content, metadata=metadata)

0 comments on commit 5ac936a

Please sign in to comment.