# dataset.py -- added as a new file by the enclosing git diff; its original
# header comment calls it "the gensentence.py script".
# Run as: python3.10 dataset.py /home/a30user/datasets/passages_eng_200.jsonl
# The dataset is written to the path given as the first command-line argument;
# only a one-line summary goes to stdout, so do not redirect stdout to the
# .jsonl file.

import json
import random
import re
import sys

random.seed(42)  # For reproducibility

# Parameters for the dataset
num_samples = 100  # Number of samples to generate
mean_tokens = 20  # Target mean token size
std_dev = 0  # Standard deviation for token length

# Predefined sentence templates with placeholders
sentence_templates = [
    "The {noun} {verb} over the {adjective} {noun}.",
    "A {adjective} {noun} {verb} quickly through the {noun}.",
    "She {verb} the {adjective} {noun} with great {noun}.",
    "In the {noun}, the {adjective} {noun} {verb} slowly.",
    "They {verb} the {noun} before {verb} to the {noun}.",
    "The {adjective} {noun} {verb} near the {adjective} {noun}.",
]

# Word lists
nouns = ["cat", "dog", "car", "tree", "house", "book", "river", "sky", "mountain", "ocean"]
verbs = ["jumps", "runs", "flies", "writes", "builds", "grows", "flows", "sings", "paints", "drives"]
adjectives = ["blue", "happy", "quick", "bright", "dark", "calm", "tall", "strong", "beautiful", "silent"]

# Matches one "{name}" placeholder inside a template.
_PLACEHOLDER_RE = re.compile(r"\{(\w+)\}")

# Placeholder name -> word list to draw from (the same list objects as above).
_WORD_POOLS = {"noun": nouns, "verb": verbs, "adjective": adjectives}


def generate_meaningful_sentence() -> str:
    """Return one random sentence built from a random template.

    Every placeholder occurrence gets its own random draw. Filling the
    template with ``str.format(noun=..., verb=..., adjective=...)`` would
    reuse a single word for all placeholders of the same kind, producing
    sentences such as "The tree writes over the beautiful tree."

    Raises:
        KeyError: if a template contains a placeholder name other than
            ``noun``, ``verb`` or ``adjective``.
    """
    template = random.choice(sentence_templates)
    return _PLACEHOLDER_RE.sub(
        lambda match: random.choice(_WORD_POOLS[match.group(1)]),
        template,
    )


def generate_random_text(token_length: int) -> str:
    """Return random text of exactly ``token_length`` whitespace-separated tokens.

    Whole sentences are appended until the next one would overshoot; that
    last sentence is cut down to the remaining number of words so the target
    is met exactly. Stopping before the overflowing sentence instead would
    always undershoot and would yield an empty string for any target shorter
    than the shortest template.

    Args:
        token_length: Desired number of tokens. Values <= 0 give ``""``.

    Returns:
        The generated words joined by single spaces.
    """
    words = []
    while len(words) < token_length:
        sentence_words = generate_meaningful_sentence().split()
        # Take only as many words as are still missing.
        words.extend(sentence_words[: token_length - len(words)])
    return " ".join(words)
# Generate the dataset
dataset = []
for _ in range(num_samples):
    # Draw the target length from a normal distribution. round() is used
    # instead of int() because int() truncates toward zero, which would pull
    # the average length below mean_tokens whenever std_dev > 0.
    # The result is clamped so the target is always at least one token.
    token_length = max(1, round(random.gauss(mean_tokens, std_dev)))
    text = generate_random_text(token_length)
    # Append the actual token count of the generated text as a
    # "(Tokens: N )" suffix inside the text field itself.
    text = " ".join([text, "(Tokens:", str(len(text.split())), ")"])

    # Each sample is stored as a JSON object with a single "text" field.
    dataset.append({"text": text})

# Save the dataset to a .jsonl file: the first command-line argument if one
# is given, otherwise sample_data.jsonl in the current directory.
file_name = sys.argv[1] if len(sys.argv) > 1 else "sample_data.jsonl"

with open(file_name, "w", encoding="utf-8") as f:
    # One JSON object per line.
    f.writelines(json.dumps(data) + "\n" for data in dataset)

print(f"Generated {num_samples} samples with mean token size {mean_tokens} and standard deviation {std_dev}.")

# ---------------------------------------------------------------------------
# The rest of this physical line in the original diff is not Python: it is the
# start of the diff for query_20.jsonl. It is preserved verbatim below as
# comments; the final record continues on the following line of the file.
# ---------------------------------------------------------------------------
# diff --git a/query_20.jsonl b/query_20.jsonl new file mode 100644 index 0000000..112042d --- /dev/null +++ b/query_20.jsonl @@ -0,0 +1,100 @@
# +{"text": "She writes the quick tree with great tree. The happy dog paints near the happy dog. (Tokens: 16 )"}
# +{"text": "The tree writes over the beautiful tree. They paints the cat before paints to the cat. (Tokens: 16 )"}
# +{"text": "The car flows over the calm car. She writes the calm car with great car. (Tokens: 15 )"}
# +{"text": "She drives the dark book with great book. The sky paints over the happy sky. (Tokens: 15 )"}
# +{"text": "She writes the happy ocean with great ocean. The tree builds over the happy tree. (Tokens: 15 )"}
# +{"text": "In the book, the calm book flies slowly. She builds the happy tree with great tree. (Tokens: 16 )"}
# +{"text": "The calm mountain writes near the calm mountain. The tree jumps over the calm tree. (Tokens: 15 )"}
# +{"text": "They writes the book before writes to the book. In the sky, the dark sky flies slowly.
(Tokens: 17 )"} +{"text": "They grows the river before grows to the river. A happy mountain sings quickly through the mountain. (Tokens: 17 )"} +{"text": "The happy river drives near the happy river. In the river, the strong river drives slowly. (Tokens: 16 )"} +{"text": "They grows the house before grows to the house. She flies the strong river with great river. (Tokens: 17 )"} +{"text": "They builds the dog before builds to the dog. They flies the tree before flies to the tree. (Tokens: 18 )"} +{"text": "The book builds over the bright book. The tree drives over the happy tree. (Tokens: 14 )"} +{"text": "A beautiful car sings quickly through the car. A silent house paints quickly through the house. (Tokens: 16 )"} +{"text": "The beautiful book sings near the beautiful book. In the dog, the bright dog writes slowly. (Tokens: 16 )"} +{"text": "They drives the tree before drives to the tree. The dog jumps over the bright dog. (Tokens: 16 )"} +{"text": "In the tree, the quick tree paints slowly. The strong ocean drives near the strong ocean. (Tokens: 16 )"} +{"text": "The dog flows over the calm dog. In the river, the blue river sings slowly. (Tokens: 15 )"} +{"text": "The tree writes over the bright tree. They flies the sky before flies to the sky. (Tokens: 16 )"} +{"text": "The sky paints over the happy sky. The mountain jumps over the happy mountain. (Tokens: 14 )"} +{"text": "The car flows over the blue car. In the house, the dark house sings slowly. (Tokens: 15 )"} +{"text": "A blue house writes quickly through the house. They jumps the mountain before jumps to the mountain. (Tokens: 17 )"} +{"text": "A happy cat paints quickly through the cat. A happy dog drives quickly through the dog. (Tokens: 16 )"} +{"text": "They drives the tree before drives to the tree. The ocean runs over the tall ocean. (Tokens: 16 )"} +{"text": "The dark book writes near the dark book. In the car, the strong car builds slowly. 
(Tokens: 16 )"} +{"text": "They runs the ocean before runs to the ocean. They paints the tree before paints to the tree. (Tokens: 18 )"} +{"text": "They drives the house before drives to the house. The mountain builds over the happy mountain. (Tokens: 16 )"} +{"text": "The dark mountain flies near the dark mountain. She writes the calm ocean with great ocean. (Tokens: 16 )"} +{"text": "The dog flows over the dark dog. The cat grows over the quick cat. (Tokens: 14 )"} +{"text": "They paints the river before paints to the river. The dog flies over the beautiful dog. (Tokens: 16 )"} +{"text": "She jumps the calm book with great book. A calm tree runs quickly through the tree. (Tokens: 16 )"} +{"text": "A tall car flies quickly through the car. The car grows over the tall car. (Tokens: 15 )"} +{"text": "The sky writes over the bright sky. In the book, the bright book builds slowly. (Tokens: 15 )"} +{"text": "She runs the dark house with great house. She flows the beautiful mountain with great mountain. (Tokens: 16 )"} +{"text": "She runs the silent cat with great cat. In the book, the tall book grows slowly. (Tokens: 16 )"} +{"text": "They builds the tree before builds to the tree. The beautiful river jumps near the beautiful river. (Tokens: 17 )"} +{"text": "She grows the happy ocean with great ocean. The dark house paints near the dark house. (Tokens: 16 )"} +{"text": "The quick house paints near the quick house. A quick river flows quickly through the river. (Tokens: 16 )"} +{"text": "She flows the silent tree with great tree. They sings the book before sings to the book. (Tokens: 17 )"} +{"text": "The dark car runs near the dark car. They grows the ocean before grows to the ocean. (Tokens: 17 )"} +{"text": "In the ocean, the strong ocean runs slowly. In the ocean, the tall ocean writes slowly. (Tokens: 16 )"} +{"text": "The tall cat runs near the tall cat. A strong car paints quickly through the car. 
(Tokens: 16 )"} +{"text": "The silent mountain paints near the silent mountain. She drives the beautiful sky with great sky. (Tokens: 16 )"} +{"text": "The dark sky sings near the dark sky. A strong house paints quickly through the house. (Tokens: 16 )"} +{"text": "She grows the beautiful book with great book. The car flies over the bright car. (Tokens: 15 )"} +{"text": "In the river, the beautiful river grows slowly. In the river, the bright river jumps slowly. (Tokens: 16 )"} +{"text": "In the sky, the calm sky jumps slowly. She flows the beautiful river with great river. (Tokens: 16 )"} +{"text": "In the tree, the tall tree builds slowly. In the cat, the calm cat flows slowly. (Tokens: 16 )"} +{"text": "They flows the cat before flows to the cat. They runs the cat before runs to the cat. (Tokens: 18 )"} +{"text": "She grows the bright river with great river. In the book, the tall book grows slowly. (Tokens: 16 )"} +{"text": "The book writes over the happy book. The bright cat jumps near the bright cat. (Tokens: 15 )"} +{"text": "A happy car sings quickly through the car. They sings the tree before sings to the tree. (Tokens: 17 )"} +{"text": "A silent house runs quickly through the house. The house drives over the tall house. (Tokens: 15 )"} +{"text": "The dark tree runs near the dark tree. The silent ocean runs near the silent ocean. (Tokens: 16 )"} +{"text": "The tall book jumps near the tall book. In the dog, the calm dog flows slowly. (Tokens: 16 )"} +{"text": "A silent mountain builds quickly through the mountain. They sings the sky before sings to the sky. (Tokens: 17 )"} +{"text": "The house sings over the bright house. In the ocean, the tall ocean drives slowly. (Tokens: 15 )"} +{"text": "A calm sky writes quickly through the sky. She builds the silent book with great book. (Tokens: 16 )"} +{"text": "A beautiful river sings quickly through the river. A strong sky sings quickly through the sky. 
(Tokens: 16 )"} +{"text": "In the tree, the silent tree builds slowly. She paints the beautiful sky with great sky. (Tokens: 16 )"} +{"text": "She writes the happy house with great house. The happy tree grows near the happy tree. (Tokens: 16 )"} +{"text": "A silent sky builds quickly through the sky. They builds the ocean before builds to the ocean. (Tokens: 17 )"} +{"text": "They builds the car before builds to the car. The mountain builds over the quick mountain. (Tokens: 16 )"} +{"text": "They sings the house before sings to the house. In the book, the blue book flies slowly. (Tokens: 17 )"} +{"text": "The quick cat flies near the quick cat. They runs the house before runs to the house. (Tokens: 17 )"} +{"text": "They writes the ocean before writes to the ocean. In the sky, the dark sky sings slowly. (Tokens: 17 )"} +{"text": "The bright dog writes near the bright dog. She flies the bright dog with great dog. (Tokens: 16 )"} +{"text": "The river sings over the silent river. In the house, the bright house jumps slowly. (Tokens: 15 )"} +{"text": "The tall ocean writes near the tall ocean. The mountain writes over the quick mountain. (Tokens: 15 )"} +{"text": "A silent house drives quickly through the house. She runs the strong sky with great sky. (Tokens: 16 )"} +{"text": "The ocean jumps over the tall ocean. The dark book drives near the dark book. (Tokens: 15 )"} +{"text": "They builds the cat before builds to the cat. The car sings over the beautiful car. (Tokens: 16 )"} +{"text": "In the dog, the calm dog sings slowly. In the book, the happy book grows slowly. (Tokens: 16 )"} +{"text": "She paints the blue river with great river. In the dog, the dark dog grows slowly. (Tokens: 16 )"} +{"text": "They flows the sky before flows to the sky. A silent mountain grows quickly through the mountain. (Tokens: 17 )"} +{"text": "She flies the dark mountain with great mountain. In the sky, the blue sky runs slowly. 
(Tokens: 16 )"} +{"text": "In the dog, the happy dog writes slowly. In the dog, the strong dog flies slowly. (Tokens: 16 )"} +{"text": "In the sky, the bright sky sings slowly. In the mountain, the tall mountain flies slowly. (Tokens: 16 )"} +{"text": "In the book, the dark book paints slowly. The house builds over the silent house. (Tokens: 15 )"} +{"text": "They grows the sky before grows to the sky. They flows the mountain before flows to the mountain. (Tokens: 18 )"} +{"text": "In the cat, the strong cat grows slowly. The quick river flows near the quick river. (Tokens: 16 )"} +{"text": "They runs the book before runs to the book. The mountain sings over the blue mountain. (Tokens: 16 )"} +{"text": "She drives the tall book with great book. The beautiful dog grows near the beautiful dog. (Tokens: 16 )"} +{"text": "The ocean runs over the bright ocean. The happy house writes near the happy house. (Tokens: 15 )"} +{"text": "The cat grows over the blue cat. She grows the tall book with great book. (Tokens: 15 )"} +{"text": "They flies the car before flies to the car. The ocean flows over the silent ocean. (Tokens: 16 )"} +{"text": "She builds the blue sky with great sky. In the house, the quick house paints slowly. (Tokens: 16 )"} +{"text": "She builds the strong river with great river. She flows the strong tree with great tree. (Tokens: 16 )"} +{"text": "The tall house jumps near the tall house. She drives the blue cat with great cat. (Tokens: 16 )"} +{"text": "They writes the book before writes to the book. They flies the house before flies to the house. (Tokens: 18 )"} +{"text": "A calm dog builds quickly through the dog. The bright river flies near the bright river. (Tokens: 16 )"} +{"text": "They flies the house before flies to the house. In the house, the happy house grows slowly. (Tokens: 17 )"} +{"text": "In the mountain, the happy mountain grows slowly. In the cat, the beautiful cat builds slowly. 
(Tokens: 16 )"} +{"text": "They grows the river before grows to the river. The blue tree sings near the blue tree. (Tokens: 17 )"} +{"text": "In the house, the happy house flows slowly. A dark cat jumps quickly through the cat. (Tokens: 16 )"} +{"text": "They flows the car before flows to the car. She flows the silent mountain with great mountain. (Tokens: 17 )"} +{"text": "She grows the bright cat with great cat. In the sky, the calm sky writes slowly. (Tokens: 16 )"} +{"text": "The river builds over the bright river. The sky runs over the bright sky. (Tokens: 14 )"} +{"text": "A happy ocean writes quickly through the ocean. They drives the tree before drives to the tree. (Tokens: 17 )"} +{"text": "The house flies over the quick house. They flies the house before flies to the house. (Tokens: 16 )"}