-
Notifications
You must be signed in to change notification settings - Fork 14.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
community[minor]: add document transformer for extracting links (#24186)
- **Description:** Add a DocumentTransformer for executing one or more `LinkExtractor`s and adding the extracted links to each document. - **Issue:** n/a - **Dependencies:** none --------- Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
- Loading branch information
1 parent
3c4652c
commit 5ac936a
Showing
5 changed files
with
174 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
43 changes: 43 additions & 0 deletions
43
...community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from typing import Any, Iterable, Sequence | ||
|
||
from langchain_core.documents import Document | ||
from langchain_core.documents.transformers import BaseDocumentTransformer | ||
from langchain_core.graph_vectorstores.links import copy_with_links | ||
|
||
from langchain_community.graph_vectorstores.extractors.link_extractor import ( | ||
LinkExtractor, | ||
) | ||
|
||
|
||
class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Every configured extractor is run over the full batch of documents, and
    the links each extractor yields for a document are attached to that
    document via ``copy_with_links``.

    Example:

        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document.

        Args:
            link_extractors: The extractors to apply. The iterable is
                materialized into a list once, so that one-shot iterables
                (e.g. generators) remain usable across repeated calls to
                ``transform_documents``.
        """
        self.link_extractors = list(link_extractors)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Return the documents with the links from every extractor attached.

        Args:
            documents: The documents to extract links from.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            One document per input document, in input order, each produced by
            ``copy_with_links`` with the links found by all extractors.
        """
        # Implement `transform_documents` directly, so that LinkExtractors which
        # operate better in batch (`extract_many`) get a chance to do so.

        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        # Zip the documents together with every extractor's results so each
        # document is paired with its links from all extractors. Driving the
        # zip from `documents` (rather than transposing the results alone)
        # keeps every document in the output even when no extractors are
        # configured.
        return [
            copy_with_links(document, *links)
            for document, *links in zip(documents, *links_per_extractor)
        ]
92 changes: 92 additions & 0 deletions
92
...mmunity/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
from typing import Set | ||
|
||
from langchain_core.documents import Document | ||
from langchain_core.graph_vectorstores.links import Link, get_links | ||
|
||
from langchain_community.graph_vectorstores.extractors import ( | ||
LinkExtractor, | ||
LinkExtractorTransformer, | ||
) | ||
|
||
TEXT1 = "Text1" | ||
TEXT2 = "Text2" | ||
|
||
|
||
class FakeKeywordExtractor(LinkExtractor[Document]):
    """Test double that maps known page contents to fixed keyword links."""

    def extract_one(self, input: Document) -> Set[Link]:
        """Return bidirectional ``fakekw`` links for the document's keywords.

        Documents whose content is neither ``TEXT1`` nor ``TEXT2`` yield an
        empty set.
        """
        keywords_by_text = {
            TEXT1: ("a", "b"),
            TEXT2: ("b", "c"),
        }
        keywords = keywords_by_text.get(input.page_content, ())
        return {Link.bidir(kind="fakekw", tag=keyword) for keyword in keywords}
|
||
|
||
class FakeHyperlinkExtractor(LinkExtractor[Document]):
    """Test double that maps known page contents to fixed hyperlink links."""

    def extract_one(self, input: Document) -> Set[Link]:
        """Return the canned ``fakehref`` links for the document.

        Raises:
            ValueError: If the document content is neither ``TEXT1`` nor
                ``TEXT2``.
        """
        content = input.page_content

        if content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }

        if content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }

        raise ValueError(
            f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
        )
|
||
|
||
def test_one_extractor() -> None:
    """A single extractor's links end up on the matching documents."""
    extractors = [FakeKeywordExtractor()]
    transformer = LinkExtractorTransformer(extractors)

    first, second = Document(TEXT1), Document(TEXT2)
    results = transformer.transform_documents([first, second])

    expected_first = {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
    }
    expected_second = {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
    }

    assert set(get_links(results[0])) == expected_first
    assert set(get_links(results[1])) == expected_second
|
||
|
||
def test_multiple_extractors() -> None:
    """Links from several extractors are merged onto each document."""
    extractors = [FakeKeywordExtractor(), FakeHyperlinkExtractor()]
    transformer = LinkExtractorTransformer(extractors)

    first, second = Document(TEXT1), Document(TEXT2)
    results = transformer.transform_documents([first, second])

    expected_first = {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
        Link.incoming(kind="fakehref", tag="http://text1"),
        Link.outgoing(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }
    expected_second = {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
        Link.incoming(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }

    assert set(get_links(results[0])) == expected_first
    assert set(get_links(results[1])) == expected_second
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters