diff --git a/examples/config/classify_text_gold.yaml b/examples/config/classify_text_gold.yaml new file mode 100644 index 0000000..728e919 --- /dev/null +++ b/examples/config/classify_text_gold.yaml @@ -0,0 +1,39 @@ +name: classify_text +data: + file: data/classify_text_data_gold.tsv + input: + text: str + output: + result: str + gold: + result: known_answer +options: + positive: Positive + negative: Negative + neutral: Neutral +interface: + prompt: Read the text and assign it to the most appropriate category. +project: + # id: 129368 + setup: + public_name: Classify text into categories + public_description: Read the text and assign it to the most appropriate category. + instructions: instructions/classify_text_instructions.html +pool: + # id: 1387049 + estimated_time_per_suite: 60 + setup: + private_name: Classify text + reward_per_assignment: 0.2 + assignment_max_duration_seconds: 600 + auto_accept_solutions: true + defaults: + default_overlap_for_new_tasks: 1 + default_overlap_for_new_task_suites: 1 + mixer: + real_tasks_count: 1 + golden_tasks_count: 1 + training_tasks_count: 0 + filter: + languages: + - EN \ No newline at end of file diff --git a/examples/data/classify_text_data_gold.tsv b/examples/data/classify_text_data_gold.tsv new file mode 100644 index 0000000..d3aec05 --- /dev/null +++ b/examples/data/classify_text_data_gold.tsv @@ -0,0 +1,5 @@ +text known_answer +This product is really bad. I returned it immediately and don't recommend it to anyone. negative +I love my new PlayStation 2! It has the best games ever! positive +The customer service in this shop is not the best, but the products are good. +I will never visit this restaurant again! 
\ No newline at end of file
diff --git a/examples/gold_demo.py b/examples/gold_demo.py
new file mode 100644
index 0000000..975bf0c
--- /dev/null
+++ b/examples/gold_demo.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+import argparse
+import json
+
+from abulafia.task_specs import TaskSequence, TextClassification
+import toloka.client as toloka
+
+# Set up the argument parser
+ap = argparse.ArgumentParser()
+
+# Add argument for input
+ap.add_argument("-c", "--creds", required=True,
+                help="Path to a JSON file that contains Toloka credentials. "
+                     "The file should have two keys: 'token' and 'mode'. "
+                     "The key 'token' should contain the Toloka API key, whereas "
+                     "the key 'mode' should have the value 'PRODUCTION' or 'SANDBOX' "
+                     "that defines the environment in which the pipeline should be run.")
+
+# Parse the arguments
+args = vars(ap.parse_args())
+
+# Assign arguments to variables
+cred_file = args['creds']
+
+# Read the credentials from the JSON file; keep the file open only while parsing it
+with open(cred_file, encoding="utf-8") as cred_f:
+    creds = json.load(cred_f)
+
+# Create the Toloka client from the 'token' and 'mode' keys of the credentials
+tclient = toloka.TolokaClient(creds['token'], creds['mode'])
+
+# Create a TextClassification task using the configuration file
+classify_text = TextClassification(configuration='config/classify_text_gold.yaml', client=tclient)
+
+# Add the tasks into a TaskSequence
+pipe = TaskSequence(sequence=[classify_text], client=tclient)
+
+# Start the task sequence; create tasks on Toloka
+pipe.start()
diff --git a/pyproject.toml b/pyproject.toml
index 5ef31c4..5efb277 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "abulafia"
-version = "0.1.7"
+version = "0.1.8"
 description = "A tool for fair and reproducible crowdsourcing using Toloka"
 readme = "README.md"
 requires-python = ">=3.8"
diff --git a/src/abulafia/functions/core_functions.py b/src/abulafia/functions/core_functions.py
index 73189de..e0885b4 100644
--- a/src/abulafia/functions/core_functions.py
+++ 
b/src/abulafia/functions/core_functions.py @@ -37,21 +37,62 @@ def create_tasks(input_obj, # Print status message msg.info(f'Creating and adding tasks to pool with ID {input_obj.pool.id}') - # Fetch input variable names from the configuration. Create a dictionary with matching - # key and value pairs, which is updated when creating the toloka.Task objects below. + # Fetch input/output variable names from the configuration. Create dictionaries with matching + # key and value pairs, which is updated when creating the Task objects below. input_values = {n: n for n in list(input_obj.conf['data']['input'].keys())} + output_values = {n: n for n in list(input_obj.conf['data']['output'].keys())} - assert set(input_values.keys()) == set(input_data.columns), raise_error(f"Input data column names " - f"do not match input configuration " - f"for the pool {input_obj.name}!") + assert set(input_values.keys()).issubset(set(input_data.columns)), \ + raise_error(f"Could not find the columns defined in the input configuration in the " + f"file with the input data for pool {input_obj.name}!") - # Create a list of Toloka Task objects by looping over the input DataFrame. Use the - # dictionary of input variable names 'input_values' to retrieve the correct columns - # from the DataFrame. - tasks = [toloka.Task(pool_id=input_obj.pool.id, - input_values={k: row[v] for k, v in input_values.items()}, - unavailable_for=input_obj.blocklist) - for _, row in input_data.iterrows()] + # Check if golden answers exist + if 'gold' in input_obj.conf['data']: + + # Print status message + msg.info(f'Found golden tasks in pool with ID {input_obj.pool.id}') + + assert set(input_obj.conf['data']['gold'].keys()).issubset(set(output_values.keys())), \ + raise_error(f"Could not find the output column defined in the configuration for " + f"golden answers in pool {input_obj.name}. Remember that the configuration " + f"for golden answers must be given as a key/value pair. 
The key " + f"names the *output* column for which the golden answers are provided. The " + f"value determines the column that holds the golden answers. For example, " + f"if the golden answers for the column 'result' are stored under column " + f"'gold' in the input file, the key value pair must be 'result: gold'.") + + # Get the dictionary for golden answers + gold_dict = input_obj.conf['data']['gold'] + + # Break the golden answer and normal input data to two DataFrames + gold_data = input_data.dropna(subset=list(gold_dict.values())) + input_data.drop(gold_data.index, axis=0, inplace=True) + + # Create golden tasks + tasks = [toloka.Task(pool_id=input_obj.pool.id, + input_values={k: row[v] for k, v in input_values.items()}, + known_solutions=[toloka.task.BaseTask.KnownSolution( + output_values={k: str(row[v]) for k, v in gold_dict.items()})], + unavailable_for=input_obj.blocklist + ) + for _, row in gold_data.iterrows()] + + # Create input tasks + tasks.extend([toloka.Task(pool_id=input_obj.pool.id, + input_values={k: row[v] for k, v in input_values.items()}, + unavailable_for=input_obj.blocklist) + for _, row in input_data.iterrows()]) + + # If no golden answers are provided, continue to create the input tasks + else: + + # Create a list of Toloka Task objects by looping over the input DataFrame. Use the + # dictionary of input variable names 'input_values' to retrieve the correct columns + # from the DataFrame. + tasks = [toloka.Task(pool_id=input_obj.pool.id, + input_values={k: row[v] for k, v in input_values.items()}, + unavailable_for=input_obj.blocklist) + for _, row in input_data.iterrows()] return tasks diff --git a/src/abulafia/task_specs/task_specs.py b/src/abulafia/task_specs/task_specs.py index 62d51f6..696f7a0 100644 --- a/src/abulafia/task_specs/task_specs.py +++ b/src/abulafia/task_specs/task_specs.py @@ -120,7 +120,7 @@ class ImageSegmentation(CrowdsourcingTask): """ This is a class for image segmentation tasks. 
""" - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the ImageSegmentation class, which inherits attributes and methods from the superclass CrowdsourcingTask. @@ -219,7 +219,7 @@ class AddOutlines(CrowdsourcingTask): """ This is a class for tasks that add more bounding boxes to images with pre-existing labelled bounding boxes. """ - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the AddOutlines class, which inherits attributes and methods from the superclass CrowdsourcingTask. @@ -344,7 +344,7 @@ class SegmentationClassification(CrowdsourcingTask): """ This is a class for binary segmentation classification tasks. """ - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the SegmentationClassification class, which inherits attributes and methods from the superclass Task. @@ -469,7 +469,7 @@ class LabelledSegmentationVerification(CrowdsourcingTask): """ This is a class for binary segmentation verification tasks with labelled bounding boxes. """ - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the LabelledSegmentationVerification class, which inherits attributes and methods from the superclass Task. @@ -617,7 +617,7 @@ class FixImageSegmentation(CrowdsourcingTask): This is a class for fixing partially correct image segmentation tasks: modifying existing outlines and/or creating new ones. """ - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the FixImageSegmentation class, which inherits attributes and methods from the superclass Task. @@ -715,7 +715,7 @@ class SegmentationVerification(CrowdsourcingTask): """ This is a class for binary image segmentation verification tasks. 
""" - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the SegmentationVerification class, which inherits attributes and methods from the superclass Task. @@ -833,7 +833,7 @@ class MulticlassVerification(CrowdsourcingTask): """ This is a class for multiclass image segmentation verification tasks. """ - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the MulticlassVerification class, which inherits attributes and methods from the superclass Task. @@ -953,7 +953,7 @@ class TextClassification(CrowdsourcingTask): This is a class for text classification tasks. """ - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the TextClassification class, which inherits attributes and methods from the superclass CrowdsourcingTask. @@ -1059,7 +1059,7 @@ class TextAnnotation(CrowdsourcingTask): This is a class for text annotation tasks. """ - def __init__(self, configuration, client): + def __init__(self, configuration, client, **kwargs): """ This function initialises the TextAnnotation class, which inherits attributes and methods from the superclass CrowdsourcingTask.