Skip to content

Commit

Permalink
Import documents - language dialog and guessing
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 19, 2022
1 parent cdf2309 commit 534b2cc
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 7 deletions.
56 changes: 49 additions & 7 deletions orangecontrib/text/widgets/owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
QVBoxLayout, QLabel, QGridLayout, QSizePolicy, QCompleter
)
from numpy import array
from orangewidget.settings import ContextHandler, Context

from orangewidget.utils.itemmodels import PyListModel

Expand All @@ -37,17 +38,19 @@
ThreadExecutor, FutureWatcher, methodinvoke
)
from Orange.widgets.widget import Output
from orangecanvas.preview.previewbrowser import TextLabel

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.import_documents import ImportDocuments, \
NoDocumentsException

try:
from orangecanvas.preview.previewbrowser import TextLabel
except ImportError:
from Orange.canvas.preview.previewbrowser import TextLabel
from orangecontrib.text.import_documents import ImportDocuments, NoDocumentsException
from orangecontrib.text.language import (
iso2lang,
detect_language,
lang2iso,
LanguageModel,
)

# domain for skipped documents output

SKIPPED_DOMAIN = Domain([], metas=[
StringVariable("name"),
StringVariable("path")
Expand Down Expand Up @@ -85,6 +88,26 @@ class State(enum.IntEnum):
NoState, Processing, Done, Cancelled, Error = range(5)


class ImportDocumentContextHandler(ContextHandler):
    """Context handler that matches contexts by a digest of the documents.

    A context is considered a perfect match for a corpus when the digest
    stored in the context equals the digest of the corpus' documents;
    otherwise it does not match at all.
    """

    @staticmethod
    def corpus_hash(corpus: Corpus) -> int:
        """Compute a process-independent digest of all documents in the Corpus.

        The builtin ``hash()`` is salted per interpreter process for strings,
        so a value stored in a saved context would not compare equal after
        the application is restarted. A ``hashlib`` digest is deterministic.

        Parameters
        ----------
        corpus
            Corpus whose ``documents`` are digested.
            NOTE(review): assumes ``corpus.documents`` yields strings (items
            are passed through ``str`` before encoding) -- confirm.

        Returns
        -------
        int
            Non-negative 64-bit integer digest of the documents, in order.
        """
        # Imported locally so the block does not depend on a module-level
        # import that the file may not have.
        import hashlib

        digest = hashlib.blake2b(digest_size=8)
        for document in corpus.documents:
            encoded = str(document).encode("utf-8", errors="surrogatepass")
            # Length prefix keeps document boundaries significant, so
            # ("ab", "c") and ("a", "bc") produce different digests.
            digest.update(len(encoded).to_bytes(8, "big"))
            digest.update(encoded)
        return int.from_bytes(digest.digest(), "big")

    def new_context(self, corpus: Corpus) -> Context:
        """Create a new context tagged with the digest of ``corpus``."""
        context = super().new_context()
        context.documents_hash = self.corpus_hash(corpus)
        return context

    # noinspection PyMethodOverriding
    def match(self, context: Context, corpus: Corpus) -> int:
        """Return ``PERFECT_MATCH`` if ``context`` belongs to ``corpus``.

        A context without a ``documents_hash`` attribute is treated as
        ``NO_MATCH`` instead of raising ``AttributeError``.
        """
        stored_hash = getattr(context, "documents_hash", None)
        if stored_hash is not None and stored_hash == self.corpus_hash(corpus):
            return self.PERFECT_MATCH
        return self.NO_MATCH


class OWImportDocuments(widget.OWWidget):
name = "Import Documents"
description = "Import text documents from folders."
Expand All @@ -95,6 +118,8 @@ class Outputs:
data = Output("Corpus", Corpus, default=True)
skipped_documents = Output("Skipped documents", Table)

settingsHandler = ImportDocumentContextHandler()

LOCAL_FILE, URL = range(2)
source = settings.Setting(LOCAL_FILE)
#: list of recent paths
Expand All @@ -104,6 +129,7 @@ class Outputs:
lemma_cb = settings.Setting(True)
pos_cb = settings.Setting(False)
ner_cb = settings.Setting(False)
language: str = settings.ContextSetting("English")

want_main_area = False
resizing_enabled = False
Expand Down Expand Up @@ -217,6 +243,17 @@ def __init__(self):
lambda: reloadbutton.setEnabled(reloadaction.isEnabled())
)

gui.comboBox(
self.controlArea,
self,
"language",
box="Language",
model=LanguageModel(),
sendSelectedValue=True,
searchable=True,
callback=self.commit,
)

box = gui.hBox(self.controlArea, "Conllu import options")
gui.checkBox(box, self, "lemma_cb", "Lemma",
callback=self.commit)
Expand Down Expand Up @@ -535,6 +572,7 @@ def start(self):
self.error()
self.Warning.clear()
self.progress_widget.setValue(0)
self.closeContext()

self.__invalidated = False
startdir = self.currentPath if self.source == self.LOCAL_FILE \
Expand Down Expand Up @@ -625,6 +663,8 @@ def __onRunFinished(self):
self.n_text_categories = len(corpus.domain.class_var.values) \
if corpus.domain.class_var else 0

self.language = iso2lang[corpus.language or detect_language(corpus)]
self.openContext(corpus)
self.base_corpus = self.corpus = corpus
self.is_conllu = is_conllu
self.tokens = lemmas
Expand Down Expand Up @@ -681,6 +721,8 @@ def commit(self):
"""
if self.is_conllu:
self.add_features()
if self.corpus:
self.corpus.attributes["language"] = lang2iso[self.language]
self.Outputs.data.send(self.corpus)
if self.skipped_documents:
skipped_table = (
Expand Down
33 changes: 33 additions & 0 deletions orangecontrib/text/widgets/tests/test_owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from unittest.mock import patch, Mock

from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate

from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments


Expand Down Expand Up @@ -117,6 +119,37 @@ def test_load_empty_folder(self):
self.wait_until_finished(widget=widget)
self.assertIsNone(self.get_output(widget.Outputs.data))

def tests_context(self):
    """Check that a manually selected language is restored from the context.

    The same folder is imported, the language is changed by hand, another
    folder is imported, and then the first folder is imported again; the
    hand-picked language is expected to come back.
    """
    tests_dir = os.path.dirname(__file__)
    good_documents = os.path.join(tests_dir, "data/documents", "good")
    conllu_documents = os.path.join(tests_dir, "data/conllu")

    def import_folder(folder):
        # point the widget at `folder`, reload and wait for the import
        self.widget.setCurrentPath(folder)
        self.widget.reload()
        self.wait_until_finished()

    def output_language():
        # language attribute of the corpus currently on the output
        return self.get_output(self.widget.Outputs.data).language

    self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
    # start from a non-default language so a change is observable
    self.widget.language = "Slovene"

    import_folder(good_documents)

    # English is recognized for the selected documents
    self.assertEqual(self.widget.language, "English")
    self.assertEqual("en", output_language())

    # the user picks a different language in the combo box
    simulate.combobox_activate_item(self.widget.controls.language, "Dutch")
    self.assertEqual(self.widget.language, "Dutch")
    self.assertEqual("nl", output_language())

    # import something else in between
    import_folder(conllu_documents)

    # import the first folder again and check that the context is restored
    import_folder(good_documents)
    self.assertEqual(self.widget.language, "Dutch")
    self.assertEqual("nl", output_language())


if __name__ == "__main__":
unittest.main()

0 comments on commit 534b2cc

Please sign in to comment.