Fix url download function #2744

Merged: 1 commit, Feb 22, 2025
2 changes: 1 addition & 1 deletion g4f/gui/client/static/js/chat.v1.js
@@ -402,7 +402,7 @@ const handle_ask = async (do_ask_gpt = true) => {
await add_conversation(window.conversation_id);

// Is message a url?
const expression = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
const expression = /^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)$/gi;


The change from a global, case-insensitive match to an anchored regex using ^ and $ is correct for checking that the entire string is a URL. However, the gi flags remain: the g flag seems unnecessary, since the intention is to validate a single URL string rather than find multiple occurrences. Consider removing it to simplify the regex; a standalone sketch follows the diff below.

const regex = new RegExp(expression);
if (message.match(regex)) {
paperclip.classList.add("blink");
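A standalone sketch of the reviewer's point (illustration only, not part of the PR), reusing the new anchored pattern with just the i flag and showing why a leftover g flag can bite if the regex object is ever reused:

    // Anchored URL check without the global flag.
    const urlOnly = /^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)$/i;

    console.log(urlOnly.test("https://example.com/docs"));        // true
    console.log(urlOnly.test("see https://example.com inline"));  // false: not the whole string

    // A global regex keeps state in lastIndex, so repeated test() calls on the
    // same object can alternate between true and false.
    const withG = /^https?:\/\/\S+$/gi;
    console.log(withG.test("https://example.com")); // true
    console.log(withG.test("https://example.com")); // false (lastIndex advanced past the match)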
6 changes: 3 additions & 3 deletions g4f/tools/files.py
@@ -416,7 +416,7 @@ def read_links(html: str, base: str) -> set[str]:
async def download_urls(
bucket_dir: Path,
urls: list[str],
max_depth: int = 1,
max_depth: int = 0,


Setting max_depth to 0 means there will be no depth limit for URL downloads. Is this intentional? It could potentially lead to an infinite loop or excessive resource usage. (A toy sketch after this hunk shows how the meaning of 0 depends on the recursion guard.)

loading_urls: set[str] = set(),
lock: asyncio.Lock = None,
delay: int = 3,
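For context on the question above, a self-contained toy illustration (not the g4f implementation) of how the meaning of max_depth=0 depends entirely on how the recursion guard is written:

    # Toy illustration only: a fake link graph stands in for real downloads.
    LINKS = {"a": ["b"], "b": ["c"], "c": []}

    def crawl(page, max_depth, visited=None):
        visited = [] if visited is None else visited
        visited.append(page)              # "download" the page itself
        if max_depth > 0:                 # convention A: follow links only while depth remains
            for link in LINKS[page]:
                crawl(link, max_depth - 1, visited)
        return visited

    print(crawl("a", 1))  # ['a', 'b']  -- behaviour of the old default, max_depth=1
    print(crawl("a", 0))  # ['a']       -- under convention A, 0 disables link-following;
                          # only if the real guard treats 0 as "unlimited" does the
                          # infinite-recursion concern above apply.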
@@ -515,7 +515,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi
if refine_chunks_with_spacy:
for chunk in stream_read_parts_and_refine(bucket_dir, delete_files):
if event_stream:
size += len(chunk.decode('utf-8'))
size += len(chunk.encode())


Using encode() instead of decode('utf-8') may change the behavior of this function. Please verify that this change does not cause any unintended issues; a standalone example after this file's diff shows the difference between the two calls.

yield f'data: {json.dumps({"action": "refine", "size": size})}\n\n'
else:
yield chunk
@@ -524,7 +524,7 @@ def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_wi
streaming = cache_stream(streaming, bucket_dir)
for chunk in streaming:
if event_stream:
size += len(chunk.decode('utf-8'))
size += len(chunk.encode())
yield f'data: {json.dumps({"action": "load", "size": size})}\n\n'
else:
yield chunk
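For context on the encode()/decode() swap flagged above: str.encode() returns the UTF-8 bytes of a string, while .decode('utf-8') exists only on bytes objects, so which call works depends on whether the stream yields chunks as str or bytes. A standalone illustration (not the g4f code):

    chunk = "héllo"                      # a text chunk, assumed here to arrive as str

    print(len(chunk))                    # 5 characters
    print(len(chunk.encode()))           # 6 -- UTF-8 byte count ('é' takes two bytes)

    try:
        chunk.decode("utf-8")            # only bytes have .decode()
    except AttributeError as exc:
        print(exc)                       # 'str' object has no attribute 'decode'

So with str chunks the new code reports the UTF-8 byte size, while the old decode('utf-8') call would only have worked if the chunks were bytes.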
4 changes: 2 additions & 2 deletions g4f/tools/web_search.py
@@ -89,10 +89,10 @@ def scrape_text(html: str, max_words: int = None, add_source=True, count_images:
if select:
select.extract()

image_select = "img[alt][src^=http]:not([alt='']):not(.avatar)"
image_select = "img[alt][src^=http]:not([alt='']):not(.avatar):not([width])"
image_link_select = f"a:has({image_select})"
yield_words = []
for paragraph in soup.select(f"h1, h2, h3, h4, h5, h6, p, table:not(:has(p)), ul:not(:has(p)), {image_link_select}"):
for paragraph in soup.select(f"h1, h2, h3, h4, h5, h6, p, pre, table:not(:has(p)), ul:not(:has(p)), {image_link_select}"):
if count_images > 0:
image = paragraph.select_one(image_select)
if image:
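To make the selector change above concrete, a small illustration with made-up HTML, assuming BeautifulSoup with its default soupsieve CSS backend (not part of the PR):

    from bs4 import BeautifulSoup

    html = """
    <p>Intro text.</p>
    <pre>print("now picked up, since 'pre' was added to the paragraph selector")</pre>
    <a href="/post"><img alt="chart" src="http://example.com/chart.png"></a>
    <a href="/user"><img alt="me" src="http://example.com/me.png" class="avatar"></a>
    <a href="/icon"><img alt="icon" src="http://example.com/icon.png" width="16"></a>
    """
    soup = BeautifulSoup(html, "html.parser")

    image_select = "img[alt][src^=http]:not([alt='']):not(.avatar):not([width])"
    image_link_select = f"a:has({image_select})"

    # Only the first link survives: the avatar is excluded by the existing
    # :not(.avatar) and the fixed-width icon by the newly added :not([width]).
    print([a["href"] for a in soup.select(image_link_select)])               # ['/post']
    print([el.name for el in soup.select(f"p, pre, {image_link_select}")])   # ['p', 'pre', 'a']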