Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat:add dataset #6

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
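# The gensentence.py script
# Run as: python3.10 gensentence.py /home/a30user/datasets/passages_eng_200.jsonl
# The output path is an optional first argument (default: sample_data.jsonl).
# Only a one-line summary is printed to stdout, so do not redirect stdout into the .jsonl file.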

import random
import json
import sys

# Fixed seed so repeated runs emit the same dataset.
random.seed(42)

# Dataset shape.
num_samples = 100  # how many passages to emit
mean_tokens = 20   # centre of the per-passage token-length distribution
std_dev = 0        # spread of that distribution (0 => every draw equals the mean)

# Sentence skeletons; the {noun}/{verb}/{adjective} fields are filled in
# from the word lists below.
sentence_templates = [
    "The {noun} {verb} over the {adjective} {noun}.",
    "A {adjective} {noun} {verb} quickly through the {noun}.",
    "She {verb} the {adjective} {noun} with great {noun}.",
    "In the {noun}, the {adjective} {noun} {verb} slowly.",
    "They {verb} the {noun} before {verb} to the {noun}.",
    "The {adjective} {noun} {verb} near the {adjective} {noun}.",
]

# Vocabulary used to fill the template fields.
nouns = [
    "cat", "dog", "car", "tree", "house",
    "book", "river", "sky", "mountain", "ocean",
]
verbs = [
    "jumps", "runs", "flies", "writes", "builds",
    "grows", "flows", "sings", "paints", "drives",
]
adjectives = [
    "blue", "happy", "quick", "bright", "dark",
    "calm", "tall", "strong", "beautiful", "silent",
]

# Function to generate a meaningful sentence based on the templates
def generate_meaningful_sentence():
    """Return one sentence built from a randomly chosen template.

    Every placeholder occurrence is filled with its own random draw.
    Filling the template with a single ``str.format`` call would substitute
    the same word for every repeated field, producing sentences such as
    "The tree writes over the beautiful tree."

    Returns:
        str: The template text with each ``{noun}``, ``{verb}`` and
        ``{adjective}`` occurrence replaced by a word from the matching list.
    """
    word_pools = {
        "noun": nouns,
        "verb": verbs,
        "adjective": adjectives,
    }
    sentence = random.choice(sentence_templates)
    for field, pool in word_pools.items():
        placeholder = "{" + field + "}"
        # Replace one occurrence at a time so each gets a fresh word.
        while placeholder in sentence:
            sentence = sentence.replace(placeholder, random.choice(pool), 1)
    return sentence

# Function to generate random text with a given token length
def generate_random_text(token_length):
    """Concatenate whole generated sentences without exceeding a token budget.

    Tokens are counted by whitespace splitting. Sentences are added one at a
    time; the first sentence that would not fit in the remaining budget is
    discarded and generation stops, so the result may be shorter than
    ``token_length`` (and is empty when not even one sentence fits).

    Args:
        token_length: Maximum number of whitespace-separated tokens allowed.

    Returns:
        str: The accepted sentences joined by single spaces.
    """
    accepted = []
    budget = token_length  # tokens still available

    while budget > 0:
        candidate = generate_meaningful_sentence()
        cost = len(candidate.split())
        if cost > budget:
            # Does not fit: drop it and stop rather than truncating mid-sentence.
            break
        accepted.append(candidate)
        budget -= cost

    return ' '.join(accepted)

# Old code being commented out
# sentence = []
# while len(sentence) < token_length:
# sentence.append(generate_meaningful_sentence())
# return ' '.join(sentence)[:token_length]


# Generate the dataset
dataset = []
for _ in range(num_samples):
    # Draw the target length from a normal distribution and clamp it to >= 1.
    token_length = max(1, int(random.gauss(mean_tokens, std_dev)))
    text = generate_random_text(token_length)
    # Tag each passage with its actual whitespace-token count.
    text = f"{text} (Tokens: {len(text.split())} )"

    # One JSON object per sample, with the passage under the "text" key.
    dataset.append({"text": text})

# Output path: first command-line argument if given, otherwise the default.
file_name = sys.argv[1] if len(sys.argv) > 1 else "sample_data.jsonl"

# Write the samples as JSON Lines (one serialized object per line).
with open(file_name, "w") as f:
    f.writelines(json.dumps(data) + "\n" for data in dataset)

print(f"Generated {num_samples} samples with mean token size {mean_tokens} and standard deviation {std_dev}.")


100 changes: 100 additions & 0 deletions query_20.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{"text": "She writes the quick tree with great tree. The happy dog paints near the happy dog. (Tokens: 16 )"}
{"text": "The tree writes over the beautiful tree. They paints the cat before paints to the cat. (Tokens: 16 )"}
{"text": "The car flows over the calm car. She writes the calm car with great car. (Tokens: 15 )"}
{"text": "She drives the dark book with great book. The sky paints over the happy sky. (Tokens: 15 )"}
{"text": "She writes the happy ocean with great ocean. The tree builds over the happy tree. (Tokens: 15 )"}
{"text": "In the book, the calm book flies slowly. She builds the happy tree with great tree. (Tokens: 16 )"}
{"text": "The calm mountain writes near the calm mountain. The tree jumps over the calm tree. (Tokens: 15 )"}
{"text": "They writes the book before writes to the book. In the sky, the dark sky flies slowly. (Tokens: 17 )"}
{"text": "They grows the river before grows to the river. A happy mountain sings quickly through the mountain. (Tokens: 17 )"}
{"text": "The happy river drives near the happy river. In the river, the strong river drives slowly. (Tokens: 16 )"}
{"text": "They grows the house before grows to the house. She flies the strong river with great river. (Tokens: 17 )"}
{"text": "They builds the dog before builds to the dog. They flies the tree before flies to the tree. (Tokens: 18 )"}
{"text": "The book builds over the bright book. The tree drives over the happy tree. (Tokens: 14 )"}
{"text": "A beautiful car sings quickly through the car. A silent house paints quickly through the house. (Tokens: 16 )"}
{"text": "The beautiful book sings near the beautiful book. In the dog, the bright dog writes slowly. (Tokens: 16 )"}
{"text": "They drives the tree before drives to the tree. The dog jumps over the bright dog. (Tokens: 16 )"}
{"text": "In the tree, the quick tree paints slowly. The strong ocean drives near the strong ocean. (Tokens: 16 )"}
{"text": "The dog flows over the calm dog. In the river, the blue river sings slowly. (Tokens: 15 )"}
{"text": "The tree writes over the bright tree. They flies the sky before flies to the sky. (Tokens: 16 )"}
{"text": "The sky paints over the happy sky. The mountain jumps over the happy mountain. (Tokens: 14 )"}
{"text": "The car flows over the blue car. In the house, the dark house sings slowly. (Tokens: 15 )"}
{"text": "A blue house writes quickly through the house. They jumps the mountain before jumps to the mountain. (Tokens: 17 )"}
{"text": "A happy cat paints quickly through the cat. A happy dog drives quickly through the dog. (Tokens: 16 )"}
{"text": "They drives the tree before drives to the tree. The ocean runs over the tall ocean. (Tokens: 16 )"}
{"text": "The dark book writes near the dark book. In the car, the strong car builds slowly. (Tokens: 16 )"}
{"text": "They runs the ocean before runs to the ocean. They paints the tree before paints to the tree. (Tokens: 18 )"}
{"text": "They drives the house before drives to the house. The mountain builds over the happy mountain. (Tokens: 16 )"}
{"text": "The dark mountain flies near the dark mountain. She writes the calm ocean with great ocean. (Tokens: 16 )"}
{"text": "The dog flows over the dark dog. The cat grows over the quick cat. (Tokens: 14 )"}
{"text": "They paints the river before paints to the river. The dog flies over the beautiful dog. (Tokens: 16 )"}
{"text": "She jumps the calm book with great book. A calm tree runs quickly through the tree. (Tokens: 16 )"}
{"text": "A tall car flies quickly through the car. The car grows over the tall car. (Tokens: 15 )"}
{"text": "The sky writes over the bright sky. In the book, the bright book builds slowly. (Tokens: 15 )"}
{"text": "She runs the dark house with great house. She flows the beautiful mountain with great mountain. (Tokens: 16 )"}
{"text": "She runs the silent cat with great cat. In the book, the tall book grows slowly. (Tokens: 16 )"}
{"text": "They builds the tree before builds to the tree. The beautiful river jumps near the beautiful river. (Tokens: 17 )"}
{"text": "She grows the happy ocean with great ocean. The dark house paints near the dark house. (Tokens: 16 )"}
{"text": "The quick house paints near the quick house. A quick river flows quickly through the river. (Tokens: 16 )"}
{"text": "She flows the silent tree with great tree. They sings the book before sings to the book. (Tokens: 17 )"}
{"text": "The dark car runs near the dark car. They grows the ocean before grows to the ocean. (Tokens: 17 )"}
{"text": "In the ocean, the strong ocean runs slowly. In the ocean, the tall ocean writes slowly. (Tokens: 16 )"}
{"text": "The tall cat runs near the tall cat. A strong car paints quickly through the car. (Tokens: 16 )"}
{"text": "The silent mountain paints near the silent mountain. She drives the beautiful sky with great sky. (Tokens: 16 )"}
{"text": "The dark sky sings near the dark sky. A strong house paints quickly through the house. (Tokens: 16 )"}
{"text": "She grows the beautiful book with great book. The car flies over the bright car. (Tokens: 15 )"}
{"text": "In the river, the beautiful river grows slowly. In the river, the bright river jumps slowly. (Tokens: 16 )"}
{"text": "In the sky, the calm sky jumps slowly. She flows the beautiful river with great river. (Tokens: 16 )"}
{"text": "In the tree, the tall tree builds slowly. In the cat, the calm cat flows slowly. (Tokens: 16 )"}
{"text": "They flows the cat before flows to the cat. They runs the cat before runs to the cat. (Tokens: 18 )"}
{"text": "She grows the bright river with great river. In the book, the tall book grows slowly. (Tokens: 16 )"}
{"text": "The book writes over the happy book. The bright cat jumps near the bright cat. (Tokens: 15 )"}
{"text": "A happy car sings quickly through the car. They sings the tree before sings to the tree. (Tokens: 17 )"}
{"text": "A silent house runs quickly through the house. The house drives over the tall house. (Tokens: 15 )"}
{"text": "The dark tree runs near the dark tree. The silent ocean runs near the silent ocean. (Tokens: 16 )"}
{"text": "The tall book jumps near the tall book. In the dog, the calm dog flows slowly. (Tokens: 16 )"}
{"text": "A silent mountain builds quickly through the mountain. They sings the sky before sings to the sky. (Tokens: 17 )"}
{"text": "The house sings over the bright house. In the ocean, the tall ocean drives slowly. (Tokens: 15 )"}
{"text": "A calm sky writes quickly through the sky. She builds the silent book with great book. (Tokens: 16 )"}
{"text": "A beautiful river sings quickly through the river. A strong sky sings quickly through the sky. (Tokens: 16 )"}
{"text": "In the tree, the silent tree builds slowly. She paints the beautiful sky with great sky. (Tokens: 16 )"}
{"text": "She writes the happy house with great house. The happy tree grows near the happy tree. (Tokens: 16 )"}
{"text": "A silent sky builds quickly through the sky. They builds the ocean before builds to the ocean. (Tokens: 17 )"}
{"text": "They builds the car before builds to the car. The mountain builds over the quick mountain. (Tokens: 16 )"}
{"text": "They sings the house before sings to the house. In the book, the blue book flies slowly. (Tokens: 17 )"}
{"text": "The quick cat flies near the quick cat. They runs the house before runs to the house. (Tokens: 17 )"}
{"text": "They writes the ocean before writes to the ocean. In the sky, the dark sky sings slowly. (Tokens: 17 )"}
{"text": "The bright dog writes near the bright dog. She flies the bright dog with great dog. (Tokens: 16 )"}
{"text": "The river sings over the silent river. In the house, the bright house jumps slowly. (Tokens: 15 )"}
{"text": "The tall ocean writes near the tall ocean. The mountain writes over the quick mountain. (Tokens: 15 )"}
{"text": "A silent house drives quickly through the house. She runs the strong sky with great sky. (Tokens: 16 )"}
{"text": "The ocean jumps over the tall ocean. The dark book drives near the dark book. (Tokens: 15 )"}
{"text": "They builds the cat before builds to the cat. The car sings over the beautiful car. (Tokens: 16 )"}
{"text": "In the dog, the calm dog sings slowly. In the book, the happy book grows slowly. (Tokens: 16 )"}
{"text": "She paints the blue river with great river. In the dog, the dark dog grows slowly. (Tokens: 16 )"}
{"text": "They flows the sky before flows to the sky. A silent mountain grows quickly through the mountain. (Tokens: 17 )"}
{"text": "She flies the dark mountain with great mountain. In the sky, the blue sky runs slowly. (Tokens: 16 )"}
{"text": "In the dog, the happy dog writes slowly. In the dog, the strong dog flies slowly. (Tokens: 16 )"}
{"text": "In the sky, the bright sky sings slowly. In the mountain, the tall mountain flies slowly. (Tokens: 16 )"}
{"text": "In the book, the dark book paints slowly. The house builds over the silent house. (Tokens: 15 )"}
{"text": "They grows the sky before grows to the sky. They flows the mountain before flows to the mountain. (Tokens: 18 )"}
{"text": "In the cat, the strong cat grows slowly. The quick river flows near the quick river. (Tokens: 16 )"}
{"text": "They runs the book before runs to the book. The mountain sings over the blue mountain. (Tokens: 16 )"}
{"text": "She drives the tall book with great book. The beautiful dog grows near the beautiful dog. (Tokens: 16 )"}
{"text": "The ocean runs over the bright ocean. The happy house writes near the happy house. (Tokens: 15 )"}
{"text": "The cat grows over the blue cat. She grows the tall book with great book. (Tokens: 15 )"}
{"text": "They flies the car before flies to the car. The ocean flows over the silent ocean. (Tokens: 16 )"}
{"text": "She builds the blue sky with great sky. In the house, the quick house paints slowly. (Tokens: 16 )"}
{"text": "She builds the strong river with great river. She flows the strong tree with great tree. (Tokens: 16 )"}
{"text": "The tall house jumps near the tall house. She drives the blue cat with great cat. (Tokens: 16 )"}
{"text": "They writes the book before writes to the book. They flies the house before flies to the house. (Tokens: 18 )"}
{"text": "A calm dog builds quickly through the dog. The bright river flies near the bright river. (Tokens: 16 )"}
{"text": "They flies the house before flies to the house. In the house, the happy house grows slowly. (Tokens: 17 )"}
{"text": "In the mountain, the happy mountain grows slowly. In the cat, the beautiful cat builds slowly. (Tokens: 16 )"}
{"text": "They grows the river before grows to the river. The blue tree sings near the blue tree. (Tokens: 17 )"}
{"text": "In the house, the happy house flows slowly. A dark cat jumps quickly through the cat. (Tokens: 16 )"}
{"text": "They flows the car before flows to the car. She flows the silent mountain with great mountain. (Tokens: 17 )"}
{"text": "She grows the bright cat with great cat. In the sky, the calm sky writes slowly. (Tokens: 16 )"}
{"text": "The river builds over the bright river. The sky runs over the bright sky. (Tokens: 14 )"}
{"text": "A happy ocean writes quickly through the ocean. They drives the tree before drives to the tree. (Tokens: 17 )"}
{"text": "The house flies over the quick house. They flies the house before flies to the house. (Tokens: 16 )"}