Skip to content

Commit

Permalink
Merge pull request #26 from thiippal/dev
Browse files Browse the repository at this point in the history
Add support for golden tasks
  • Loading branch information
thiippal authored Dec 2, 2022
2 parents fcebb7d + 9c3a8e8 commit 1b2718e
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 22 deletions.
39 changes: 39 additions & 0 deletions examples/config/classify_text_gold.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Example configuration for a text classification task that includes golden tasks
# (tasks with known answers). Loaded by examples/gold_demo.py.
name: classify_text
data:
# Data file with the task inputs; columns are matched against the names under 'input'.
file: data/classify_text_data_gold.tsv
input:
text: str
output:
result: str
# Golden answers are given as key/value pairs: the key names the *output* column for
# which golden answers are provided, the value names the column in the data file that
# holds them. Rows with a value in that column become golden tasks.
gold:
result: known_answer
options:
positive: Positive
negative: Negative
neutral: Neutral
interface:
prompt: Read the text and assign it to the most appropriate category.
project:
# id: 129368
setup:
public_name: Classify text into categories
public_description: Read the text and assign it to the most appropriate category.
instructions: instructions/classify_text_instructions.html
pool:
# id: 1387049
estimated_time_per_suite: 60
setup:
private_name: Classify text
reward_per_assignment: 0.2
assignment_max_duration_seconds: 600
auto_accept_solutions: true
defaults:
default_overlap_for_new_tasks: 1
default_overlap_for_new_task_suites: 1
# NOTE(review): presumably the number of real, golden and training tasks mixed into
# each task suite -- confirm against the Toloka mixer configuration documentation.
mixer:
real_tasks_count: 1
golden_tasks_count: 1
training_tasks_count: 0
filter:
languages:
- EN
5 changes: 5 additions & 0 deletions examples/data/classify_text_data_gold.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
text known_answer
This product is really bad. I returned it immediately and don't recommend it to anyone. negative
I love my new PlayStation 2! It has the best games ever! positive
The customer service in this shop is not the best, but the products are good.
I will never visit this restaurant again!
38 changes: 38 additions & 0 deletions examples/gold_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-

from abulafia.task_specs import TaskSequence, TextClassification
import argparse
import json
import toloka.client as toloka

# Example pipeline: create a text classification pool that mixes golden tasks
# (tasks with known answers) with regular tasks, then start it on Toloka.

# Set up the argument parser
ap = argparse.ArgumentParser()

# Add argument for the credentials file
ap.add_argument("-c", "--creds", required=True,
                help="Path to a JSON file that contains Toloka credentials. "
                     "The file should have two keys: 'token' and 'mode'. "
                     "The key 'token' should contain the Toloka API key, whereas "
                     "the key 'mode' should have the value 'PRODUCTION' or 'SANDBOX' "
                     "that defines the environment in which the pipeline should be run.")

# Parse the arguments
args = vars(ap.parse_args())

# Assign arguments to variables
cred_file = args['creds']

# Read the credentials from the JSON file. The encoding is given explicitly so
# that decoding does not depend on the platform's locale default.
with open(cred_file, encoding='utf-8') as cred_f:
    creds = json.load(cred_f)

# Create the Toloka client once the credentials file has been closed
tclient = toloka.TolokaClient(creds['token'], creds['mode'])

# Create a TextClassification task using the configuration file
classify_text = TextClassification(configuration='config/classify_text_gold.yaml', client=tclient)

# Add the tasks into a TaskSequence
pipe = TaskSequence(sequence=[classify_text], client=tclient)

# Start the task sequence; create tasks on Toloka
pipe.start()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "abulafia"
version = "0.1.7"
version = "0.1.8"
description = "A tool for fair and reproducible crowdsourcing using Toloka"
readme = "README.md"
requires-python = ">=3.8"
Expand Down
65 changes: 53 additions & 12 deletions src/abulafia/functions/core_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,62 @@ def create_tasks(input_obj,
# Print status message
msg.info(f'Creating and adding tasks to pool with ID {input_obj.pool.id}')

# Fetch input variable names from the configuration. Create a dictionary with matching
# key and value pairs, which is updated when creating the toloka.Task objects below.
# Fetch input/output variable names from the configuration. Create dictionaries with matching
# key and value pairs, which are used when creating the Task objects below.
input_values = {n: n for n in list(input_obj.conf['data']['input'].keys())}
output_values = {n: n for n in list(input_obj.conf['data']['output'].keys())}

assert set(input_values.keys()) == set(input_data.columns), raise_error(f"Input data column names "
f"do not match input configuration "
f"for the pool {input_obj.name}!")
assert set(input_values.keys()).issubset(set(input_data.columns)), \
raise_error(f"Could not find the columns defined in the input configuration in the "
f"file with the input data for pool {input_obj.name}!")

# Create a list of Toloka Task objects by looping over the input DataFrame. Use the
# dictionary of input variable names 'input_values' to retrieve the correct columns
# from the DataFrame.
tasks = [toloka.Task(pool_id=input_obj.pool.id,
input_values={k: row[v] for k, v in input_values.items()},
unavailable_for=input_obj.blocklist)
for _, row in input_data.iterrows()]
# Check if golden answers exist
if 'gold' in input_obj.conf['data']:

# Print status message
msg.info(f'Found golden tasks in pool with ID {input_obj.pool.id}')

assert set(input_obj.conf['data']['gold'].keys()).issubset(set(output_values.keys())), \
raise_error(f"Could not find the output column defined in the configuration for "
f"golden answers in pool {input_obj.name}. Remember that the configuration "
f"for golden answers must be given as a key/value pair. The key "
f"names the *output* column for which the golden answers are provided. The "
f"value determines the column that holds the golden answers. For example, "
f"if the golden answers for the column 'result' are stored under column "
f"'gold' in the input file, the key value pair must be 'result: gold'.")

# Get the dictionary for golden answers
gold_dict = input_obj.conf['data']['gold']

# Split the data into two DataFrames: rows with golden answers and the remaining normal input rows
gold_data = input_data.dropna(subset=list(gold_dict.values()))
input_data.drop(gold_data.index, axis=0, inplace=True)

# Create golden tasks
tasks = [toloka.Task(pool_id=input_obj.pool.id,
input_values={k: row[v] for k, v in input_values.items()},
known_solutions=[toloka.task.BaseTask.KnownSolution(
output_values={k: str(row[v]) for k, v in gold_dict.items()})],
unavailable_for=input_obj.blocklist
)
for _, row in gold_data.iterrows()]

# Create input tasks
tasks.extend([toloka.Task(pool_id=input_obj.pool.id,
input_values={k: row[v] for k, v in input_values.items()},
unavailable_for=input_obj.blocklist)
for _, row in input_data.iterrows()])

# If no golden answers are provided, continue to create the input tasks
else:

# Create a list of Toloka Task objects by looping over the input DataFrame. Use the
# dictionary of input variable names 'input_values' to retrieve the correct columns
# from the DataFrame.
tasks = [toloka.Task(pool_id=input_obj.pool.id,
input_values={k: row[v] for k, v in input_values.items()},
unavailable_for=input_obj.blocklist)
for _, row in input_data.iterrows()]

return tasks

Expand Down
18 changes: 9 additions & 9 deletions src/abulafia/task_specs/task_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ class ImageSegmentation(CrowdsourcingTask):
"""
This is a class for image segmentation tasks.
"""
def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the ImageSegmentation class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down Expand Up @@ -219,7 +219,7 @@ class AddOutlines(CrowdsourcingTask):
"""
This is a class for tasks that add more bounding boxes to images with pre-existing labelled bounding boxes.
"""
def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the AddOutlines class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down Expand Up @@ -344,7 +344,7 @@ class SegmentationClassification(CrowdsourcingTask):
"""
This is a class for binary segmentation classification tasks.
"""
def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the SegmentationClassification class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down Expand Up @@ -469,7 +469,7 @@ class LabelledSegmentationVerification(CrowdsourcingTask):
"""
This is a class for binary segmentation verification tasks with labelled bounding boxes.
"""
def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the LabelledSegmentationVerification class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down Expand Up @@ -617,7 +617,7 @@ class FixImageSegmentation(CrowdsourcingTask):
This is a class for fixing partially correct image segmentation tasks: modifying
existing outlines and/or creating new ones.
"""
def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the FixImageSegmentation class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down Expand Up @@ -715,7 +715,7 @@ class SegmentationVerification(CrowdsourcingTask):
"""
This is a class for binary image segmentation verification tasks.
"""
def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the SegmentationVerification class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down Expand Up @@ -833,7 +833,7 @@ class MulticlassVerification(CrowdsourcingTask):
"""
This is a class for multiclass image segmentation verification tasks.
"""
def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the MulticlassVerification class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down Expand Up @@ -953,7 +953,7 @@ class TextClassification(CrowdsourcingTask):
This is a class for text classification tasks.
"""

def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the TextClassification class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down Expand Up @@ -1059,7 +1059,7 @@ class TextAnnotation(CrowdsourcingTask):
This is a class for text annotation tasks.
"""

def __init__(self, configuration, client):
def __init__(self, configuration, client, **kwargs):
"""
This function initialises the TextAnnotation class, which inherits attributes
and methods from the superclass CrowdsourcingTask.
Expand Down

0 comments on commit 1b2718e

Please sign in to comment.