Commit
Merge branch 'main' into simplify_qa_example
yoavkatz committed Sep 25, 2024
2 parents 3d81403 + 80b284f commit e7ffeba
Showing 60 changed files with 899 additions and 443 deletions.
4 changes: 2 additions & 2 deletions .secrets.baseline
@@ -3,7 +3,7 @@
"files": "^.secrets.baseline$",
"lines": null
},
"generated_at": "2024-08-21T15:51:06Z",
"generated_at": "2024-09-21T12:22:01Z",
"plugins_used": [
{
"name": "AWSKeyDetector"
@@ -82,7 +82,7 @@
"hashed_secret": "fa172616e9af3d2a24b5597f264eab963fe76889",
"is_secret": false,
"is_verified": false,
"line_number": 1946,
"line_number": 1955,
"type": "Hex High Entropy String",
"verified_result": null
}
48 changes: 40 additions & 8 deletions examples/run_generic_inference_engine.py
@@ -1,20 +1,52 @@
from unitxt import get_logger, produce
from unitxt.inference import GenericInferenceEngine
from unitxt import get_logger, produce # Import necessary functions from unitxt
from unitxt.inference import GenericInferenceEngine # Import the inference engine class

if __name__ == "__main__":
generic_engine = GenericInferenceEngine(
default="engines.ibm_gen_ai.llama_3_8b_instruct"
# Create an instance of the GenericInferenceEngine with a default engine.
# If the UNITXT_INFERENCE_ENGINE environment variable does not specify an engine, this default is used.
generic_engine_with_default = GenericInferenceEngine(
default="engines.ibm_gen_ai.llama_3_70b_instruct"
)

# Define the recipe for data processing and model selection.
# - card: Specifies the underlying data (from cards.almost_evil).
# - template: Selects the prompt template to apply (from templates.qa.open.simple).
# - demos_pool_size and num_demos: Control the number of demonstration examples used (set to 0 here).
recipe = "card=cards.almost_evil,template=templates.qa.open.simple,demos_pool_size=0,num_demos=0"

# Create a list of instances (data points) for inference.
# Each instance has a "question" and its corresponding "answers".
instances = [
{"question": "How many days there are in a week", "answers": ["7"]},
{
"question": "If a ate an apple in the morning, and one in the evening, how many apples did I eat?",
"question": "How many days there are in a week, answer only with numerals",
"answers": ["7"],
},
{
"question": "If a ate an apple in the morning, and one in the evening, what is the number of apples I have eaten?, answer only with numerals",
"answers": ["2"],
},
]

# Process the instances using the defined recipe.
# This formats each instance according to the chosen card and template.
dataset = produce(instances, recipe)

predictions = generic_engine.infer(dataset)
# Perform inference on the processed dataset using the engine with the default model.
predictions = generic_engine_with_default.infer(dataset)
get_logger().info(predictions) # Log the predictions

# The following code block demonstrates how to use the GenericInferenceEngine without specifying a
# default engine. It expects the engine to be defined in the UNITXT_INFERENCE_ENGINE environment variable.
try:
# Attempt to create an instance without a default engine.
generic_engine_without_default = GenericInferenceEngine()

get_logger().info(predictions)
# Perform inference (will use the engine specified in the environment variable).
predictions = generic_engine_without_default.infer(dataset)
get_logger().info(predictions) # Log the predictions
except Exception:
# Handle the case where the environment variable is not set.
get_logger().error(
"GenericInferenceEngine could not be initialized without a default since "
"UNITXT_INFERENCE_ENGINE environmental variable is not set."
)
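For reference, here is a minimal sketch of the environment-variable path that the try/except above exercises. It assumes UNITXT_INFERENCE_ENGINE is read when the engine is constructed and that it accepts any cataloged engine name; the engine chosen below is only an illustrative choice:

import os

from unitxt import get_logger, produce
from unitxt.inference import GenericInferenceEngine

# Select the engine via the environment before constructing GenericInferenceEngine.
os.environ["UNITXT_INFERENCE_ENGINE"] = "engines.ibm_gen_ai.llama_3_70b_instruct"

recipe = "card=cards.almost_evil,template=templates.qa.open.simple,demos_pool_size=0,num_demos=0"
instances = [{"question": "How many days are there in a week?", "answers": ["7"]}]

dataset = produce(instances, recipe)
engine = GenericInferenceEngine()  # no default: resolved from the environment variable
get_logger().info(engine.infer(dataset))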
19 changes: 8 additions & 11 deletions prepare/cards/fin_qa.py
@@ -1,38 +1,35 @@
from unitxt.blocks import (
LoadHF,
SerializeTableAsIndexedRowMajor,
TaskCard,
TemplatesList,
)
from unitxt.catalog import add_to_catalog
from unitxt.operators import CopyFields, FilterByExpression
from unitxt.operators import Copy, FilterByExpression
from unitxt.struct_data_operators import MapTableListsToStdTableJSON
from unitxt.task import Task
from unitxt.templates import InputOutputTemplate
from unitxt.test_utils.card import test_card
from unitxt.types import Table

card = TaskCard(
loader=LoadHF(path="ibm/finqa", streaming=False),
preprocess_steps=[
FilterByExpression(expression="len(table) > 1"),
CopyFields(field_to_field=[["pre_text/0", "pre_text"]]),
CopyFields(field_to_field=[["post_text/0", "post_text"]]),
MapTableListsToStdTableJSON(field_to_field=[["table", "stdtable"]]),
SerializeTableAsIndexedRowMajor(
field_to_field=[["stdtable", "serialized_table"]]
),
Copy(field="pre_text/0", to_field="pre_text"),
Copy(field="post_text/0", to_field="post_text"),
MapTableListsToStdTableJSON(field="table"),
],
task=Task(
inputs={
"pre_text": str,
"serialized_table": str,
"table": Table,
"post_text": str,
"question": str,
},
outputs={"program_re": str, "answer": str},
prediction_type=str,
metrics=["metrics.fin_qa_metric"],
augmentable_inputs=["pre_text", "serialized_table", "post_text", "question"],
augmentable_inputs=["pre_text", "table", "post_text", "question"],
),
templates=TemplatesList(
[
@@ -52,7 +49,7 @@
["table-min", "table header", "number", "the minimum number of one table row"]]
Answer with only the program, without any additional explanation.
Pre-table text: {pre_text}
Table: {serialized_table}
Table: {table}
Post-table text: {post_text}
Question: {question}
Program:
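As an aside, the slash paths passed to Copy above address nested fields, with numeric segments acting as list indices. A plain-Python sketch of the assumed semantics (not the operator's actual implementation):

def copy_field(instance: dict, field: str, to_field: str) -> dict:
    # Walk the "/" path, treating numeric segments as list indices.
    value = instance
    for part in field.split("/"):
        value = value[int(part)] if part.isdigit() else value[part]
    instance[to_field] = value
    return instance

# Mirrors Copy(field="pre_text/0", to_field="pre_text") from the card above.
instance = {"pre_text": ["first paragraph", "second paragraph"]}
print(copy_field(instance, "pre_text/0", "pre_text"))  # {'pre_text': 'first paragraph'}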
6 changes: 3 additions & 3 deletions prepare/cards/numeric_nlg.py
@@ -2,16 +2,16 @@
LoadHF,
MapHTMLTableToJSON,
Rename,
SerializeTableAsMarkdown,
Set,
TaskCard,
)
from unitxt.catalog import add_to_catalog
from unitxt.operators import Copy
from unitxt.templates import TemplatesList
from unitxt.test_utils.card import test_card

card = TaskCard(
loader=LoadHF(path="kasnerz/numericnlg"), # TODO: load from github repo
loader=LoadHF(path="kasnerz/numericnlg"),
preprocess_steps=[
Set(
fields={
@@ -21,7 +21,7 @@
}
),
MapHTMLTableToJSON(field="table_html_clean", to_field="table_out"),
SerializeTableAsMarkdown(field="table_out", to_field="input_a"),
Copy(field="table_out", to_field="input_a"),
Rename(field="description", to_field="output"),
Rename(field="caption", to_field="input_b"),
],
Empty file.
4 changes: 1 addition & 3 deletions prepare/cards/scigen.py
@@ -3,7 +3,6 @@
ConstructTableFromRowsCols,
LoadHF,
Rename,
SerializeTableAsIndexedRowMajor,
Set,
TaskCard,
)
@@ -16,9 +15,8 @@
FilterByCondition(values={"table_content_values": "[]"}, condition="ne"),
ConstructTableFromRowsCols(
fields=["table_column_names", "table_content_values"],
to_field="table",
to_field="input_a",
),
SerializeTableAsIndexedRowMajor(field_to_field=[["table", "input_a"]]),
Rename(field_to_field={"table_caption": "input_b", "text": "output"}),
Set(
fields={
4 changes: 1 addition & 3 deletions prepare/cards/tab_fact.py
@@ -2,7 +2,6 @@
LoadHF,
MapInstanceValues,
Rename,
SerializeTableAsIndexedRowMajor,
Set,
TaskCard,
)
@@ -16,8 +15,7 @@
path="ibm/tab_fact", streaming=False, data_classification_policy=["public"]
),
preprocess_steps=[
SerializeTableAsIndexedRowMajor(field_to_field=[["table", "table_serialized"]]),
Rename(field_to_field={"table_serialized": "text_a", "statement": "text_b"}),
Rename(field_to_field={"table": "text_a", "statement": "text_b"}),
MapInstanceValues(mappers={"label": {"0": "refuted", "1": "entailed"}}),
Set(
fields={
10 changes: 3 additions & 7 deletions prepare/cards/turl_col_type.py
@@ -3,29 +3,25 @@
from unitxt.blocks import (
InputOutputTemplate,
LoadHF,
SerializeTableAsIndexedRowMajor,
Task,
TaskCard,
TemplatesList,
)
from unitxt.catalog import add_to_catalog
from unitxt.test_utils.card import test_card
from unitxt.types import Table

card = TaskCard(
loader=LoadHF(
path="ibm/turl_table_col_type",
streaming=False,
data_classification_policy=["public"],
),
preprocess_steps=[
SerializeTableAsIndexedRowMajor(field_to_field=[["table", "table_lin"]])
],
task=Task(
input_fields={
"page_title": str,
"section_title": str,
"table_caption": str,
"table_lin": str,
"table": Table,
"vocab": List[str],
"colname": str,
},
@@ -41,7 +37,7 @@
[
InputOutputTemplate(
input_format="""
This is a column type annotation task. The goal of this task is to choose the correct types for one selected column of the given input table from the given candidate types. The Wikipedia page, section and table caption (if any) provide important information for choosing the correct column types. \nPage Title: {page_title} \nSection Title: {section_title} \nTable caption: {table_caption} \nTable: \n{table_lin} \nSelected Column: {colname} \nCandidate Types: {vocab} \nOutput only the correct column types for this column (column name: {colname}) from the candidate types.
This is a column type annotation task. The goal of this task is to choose the correct types for one selected column of the given input table from the given candidate types. The Wikipedia page, section and table caption (if any) provide important information for choosing the correct column types. \nPage Title: {page_title} \nSection Title: {section_title} \nTable caption: {table_caption} \nTable: \n{table} \nSelected Column: {colname} \nCandidate Types: {vocab} \nOutput only the correct column types for this column (column name: {colname}) from the candidate types.
""".strip(),
output_format="{annotations}",
postprocessors=["processors.to_list_by_comma"],
8 changes: 2 additions & 6 deletions prepare/cards/wikitq.py
@@ -1,10 +1,9 @@
from unitxt.blocks import (
LoadHF,
SerializeTableAsIndexedRowMajor,
Set,
TaskCard,
)
from unitxt.catalog import add_to_catalog
from unitxt.operators import Copy, Set
from unitxt.templates import MultiReferenceTemplate, TemplatesList
from unitxt.test_utils.card import test_card

@@ -15,10 +14,7 @@
),
preprocess_steps=[
Set({"context_type": "table"}),
## truncate only if needed as it can impact evaluation results.
# TruncateTableCells(max_length=15, table="table", text_output="answers"),
# TruncateTableRows(field="table", rows_to_keep=50),
SerializeTableAsIndexedRowMajor(field_to_field=[["table", "context"]]),
Copy(field="table", to_field="context"),
],
task="tasks.qa.with_context.extractive[metrics=[metrics.f1_strings, metrics.unsorted_list_exact_match]]",
templates=TemplatesList(
11 changes: 11 additions & 0 deletions prepare/engines/ibm_wml/llama3.py
@@ -0,0 +1,11 @@
from unitxt.catalog import add_to_catalog
from unitxt.inference import WMLInferenceEngine

model_list = ["meta-llama/llama-3-70b-instruct"]

for model in model_list:
# Catalog entry names use "." as a hierarchy separator, so dots inside model names are replaced with ",".
model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower()
inference_model = WMLInferenceEngine(
model_name=model, max_new_tokens=2048, random_seed=42
)
add_to_catalog(inference_model, f"engines.ibm_wml.{model_label}", overwrite=True)
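A quick way to sanity-check the new catalog entry is to fetch it back; this assumes unitxt's fetch_artifact helper, which resolves catalog names to artifacts:

from unitxt.artifact import fetch_artifact

# "meta-llama/llama-3-70b-instruct" yields the label "llama_3_70b_instruct".
engine, _ = fetch_artifact("engines.ibm_wml.llama_3_70b_instruct")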
Empty file removed prepare/engines/ollama/__init__.py
@@ -0,0 +1,62 @@
from unitxt import add_to_catalog
from unitxt.inference import (
GenericInferenceEngine,
IbmGenAiInferenceEngine,
WMLInferenceEngine,
)
from unitxt.llm_as_judge import LLMAsJudge

model_list = ["meta-llama/llama-3-8b-instruct", "meta-llama/llama-3-70b-instruct"]
format = "formats.llama3_instruct"
templates = [
"templates.response_assessment.pairwise_comparative_rating.arena_hard",
"templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling",
]

inference_engines = [
("ibm_wml", WMLInferenceEngine),
("ibm_genai", IbmGenAiInferenceEngine),
("generic_engine", GenericInferenceEngine),
]


for template in templates:
task = "pairwise_comparative_rating.single_turn"

for model_id in model_list:
for inference_engine_name, inference_engine in inference_engines:
if (
inference_engine_name == "ibm_wml"
and model_id == "meta-llama/llama-3-8b-instruct"
):
continue # currently not supported

# if inference engine is generic, these configurations will be defined when it is saved to the catalog
if inference_engine_name != "generic_engine":
inference_model = inference_engine(
model_name=model_id, max_new_tokens=2048, random_seed=42
)
else:
inference_model = inference_engine(
default="engines.ibm_gen_ai.llama_3_70b_instruct"
)

model_label = (
model_id.split("/")[1].replace("-", "_").replace(".", ",").lower()
)
model_label = f"{model_label}_{inference_engine_name}"
template_label = template.split(".")[-1]
metric_label = f"{model_label}_template_{template_label}"
metric = LLMAsJudge(
inference_model=inference_model,
template=template,
task=task,
format=format,
main_score=metric_label,
)

add_to_catalog(
metric,
f"metrics.llm_as_judge.pairwise_comparative_rating.{model_label}_template_{template_label}",
overwrite=True,
)
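For orientation, the loop composes each metric's catalog name from the model, engine, and template labels. A small standalone check of the naming scheme, reusing the same expressions:

model_id = "meta-llama/llama-3-70b-instruct"
template = "templates.response_assessment.pairwise_comparative_rating.arena_hard"

model_label = model_id.split("/")[1].replace("-", "_").replace(".", ",").lower()
model_label = f"{model_label}_ibm_wml"
template_label = template.split(".")[-1]

print(f"metrics.llm_as_judge.pairwise_comparative_rating.{model_label}_template_{template_label}")
# metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct_ibm_wml_template_arena_hard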

This file was deleted.

7 changes: 4 additions & 3 deletions prepare/tasks/classification.py
@@ -1,7 +1,8 @@
from typing import List
from typing import List, Union

from unitxt.blocks import Task
from unitxt.catalog import add_to_catalog
from unitxt.task import Task
from unitxt.types import Audio, Dialog, Image, Table, Text

add_to_catalog(
Task(
@@ -79,7 +80,7 @@
add_to_catalog(
Task(
input_fields={
"text_a": str,
"text_a": Union[Text, Image, Audio, Table, Dialog],
"text_a_type": str,
"text_b": str,
"text_b_type": str,
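Finally, the widened text_a annotation above lets the classification task accept structured inputs. A sketch of a Table-shaped value, assuming unitxt's Table type is a header/rows mapping (an assumption based on the unitxt.types import, not a confirmed schema):

# Hypothetical value for a Table-typed "text_a" field.
table_value = {
    "header": ["city", "population"],
    "rows": [["Rome", 2873000], ["Paris", 2161000]],
}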