From 1b038ece59be595fdb529d8f76f5d1157de4a89b Mon Sep 17 00:00:00 2001 From: capjamesg Date: Sun, 28 Jul 2024 01:28:21 +0000 Subject: [PATCH] Update results --- index.html | 160 ++++++++++++++++++++-------------------- results/2024-07-28.json | 106 ++++++++++++++++++++++++++ 2 files changed, 187 insertions(+), 79 deletions(-) create mode 100644 results/2024-07-28.json diff --git a/index.html b/index.html index 47092d1..d2fd29b 100644 --- a/index.html +++ b/index.html @@ -40,7 +40,7 @@

How's GPT-4o Doing?

You can contribute your own tests, too! See the GitHub README for contributing instructions.

-

Tests are run every day at 1am PT. Last updated July 27, 2024.

+

Tests are run every day at 1am PT. Last updated July 28, 2024.

Made with ❤️ by the team at Roboflow.

@@ -122,7 +122,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
7
+
8

Test submitted by Roboflow

@@ -181,7 +181,61 @@

Result

- + +
+
+
+

Math OCR

+

Can GPT-4V recognize math equations?

+
+
+
+

Fail

+
+
+
+
+
+ Last 7-Day Performance +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+
+

Of the last 7 tests, conducted daily, this test has passed 86.0% of the time.

+

Today's request cost $0.015

+
+
+ +
+

Method

+
We provide an image of a math equation and ask it to provide a LaTeX string of the equation. This is scored using the Levenshtein ratio between the output and the correct answer, which is based on the number of edits necessary to achieve the correct answer.
+

Prompt

+
+                                            Produce a JSON array with a LaTeX string of each equation in the image.
+                                        
+

Image

+ Image of the input into GPT-4 +

Result

+
$3x^2-6x+2$
+

Test submitted by Roboflow

+
+
+
+
@@ -230,7 +284,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
{'x': 0.52, 'y': 0.34, 'width': 0.28, 'height': 0.34}
+
{'x': 0.48, 'y': 0.34, 'width': 0.26, 'height': 0.37}

Test submitted by Roboflow

@@ -287,19 +341,19 @@

Result

```json
 {
   "A": {
-    "quantity": 20,
+    "quantity": 15,
     "price": 10
   },
   "B": {
-    "quantity": 25,
+    "quantity": 20,
     "price": 20
   },
   "C": {
-    "quantity": 30,
+    "quantity": 25,
     "price": 30
   },
   "D": {
-    "quantity": 35,
+    "quantity": 30,
     "price": 40
   }
 }
@@ -359,9 +413,9 @@ 

Image

Result

```json
 {
-  "R": 82,
+  "R": 85,
   "G": 0,
-  "B": 106
+  "B": 130
 }
 ```

Test submitted by Roboflow

@@ -419,7 +473,7 @@

Image

Result

```json
 {
-  "missing": 1
+    "missing": 1
 }
 ```

Test submitted by Roboflow

@@ -475,7 +529,9 @@

Prompt

Image

Image of the input into GPT-4

Result

-
Based on the ruler visible in the image, the square sticker appears to be approximately 3 inches in both length and width. Here's the JSON representation:
+                                        
Based on the ruler in the image, the square sticker appears to have dimensions of approximately 3 inches by 3 inches.
+
+Here is the JSON representation:
 
 ```json
 {
@@ -487,61 +543,7 @@ 

Result

- -
-
-
-

Easy Captcha

-

Can GPT-4V break an easy CAPTCHA?

-
-
-
-

Fail

-
-
-
-
-
- Last 7-Day Performance -
- -
- -
- -
- -
- -
- -
- -
- -
-
-

Of the last 7 tests, conducted daily, this test has passed 100% of the time.

-

Today's request cost $0.005

-
-
- -
-

Method

-
We provide a CAPTCHA image (created using Wolfram Alpha's CAPTCHA command) and ask it to provide the input required to pass the test. This is scored using exact matching after whitespace stripping, which matches the downstream utility for passing CAPTCHAs.
-

Prompt

-
-                                            Please provide the string required to pass this CAPTCHA. Do not respond with anything else. Do not include whitespace besides spaces between words.
-                                        
-

Image

- Image of the input into GPT-4 -

Result

-
charybdis in- dubitable
-

Test submitted by Charles Frye

-
-
-
- +
@@ -712,12 +714,12 @@

Result

- +
-

Math OCR

-

Can GPT-4V recognize math equations?

+

Easy Captcha

+

Can GPT-4V break an easy CAPTCHA?

@@ -742,31 +744,31 @@

Math OCR

-
+
-

Of the last 7 tests, conducted daily, this test has passed 86.0% of the time.

-

Today's request cost $0.015

+

Of the last 7 tests, conducted daily, this test has passed 100% of the time.

+

Today's request cost $0.005

Method

-
We provide an image of a math equation and ask it to provide a LaTeX string of the equation. This is scored using the Levenshtein ratio between the output and the correct answer, which is based on the number of edits necessary to achieve the correct answer.
+
We provide a CAPTCHA image (created using Wolfram Alpha's CAPTCHA command) and ask it to provide the input required to pass the test. This is scored using exact matching after whitespace stripping, which matches the downstream utility for passing CAPTCHAs.

Prompt

-                                            Produce a JSON array with a LaTeX string of each equation in the image.
+                                            Please provide the string required to pass this CAPTCHA. Do not respond with anything else. Do not include whitespace besides spaces between words.
                                         

Image

- Image of the input into GPT-4 + Image of the input into GPT-4

Result

-
3x^2-6x+2
-

Test submitted by Roboflow

+
charybdis indubitable
+

Test submitted by Charles Frye

- +
diff --git a/results/2024-07-28.json b/results/2024-07-28.json new file mode 100644 index 0000000..6f58b6f --- /dev/null +++ b/results/2024-07-28.json @@ -0,0 +1,106 @@ +{ + "zero_shot_classification": { + "score": 1, + "success": true, + "price": 0.00481, + "pass_fail": "Pass", + "response_time": 2.090665340423584, + "result": "Toyota Camry" + }, + "count_fruit": { + "score": 0, + "success": false, + "price": 0.007870000000000002, + "pass_fail": "Fail", + "response_time": 2.8873417377471924, + "result": "8" + }, + "document_ocr": { + "score": 1, + "success": true, + "price": 0.008539999999999999, + "pass_fail": "Pass", + "response_time": 2.4150614738464355, + "result": "I was thinking earlier today that I have gone through, to use the lingo, eras of listening to each of Swift's Eras. Meta indeed. I started listening to Ms. Swift's music after hearing the Midnights album. A few weeks after hearing the album for the first time, I found myself playing various songs on repeat. I listened to the album in order multiple times." + }, + "handwriting_ocr": { + "score": 0, + "success": false, + "price": 0.00876, + "pass_fail": "Fail", + "response_time": 5.740375280380249, + "result": "The words of songs on the album have been echoing in my head all week. 
\"Fades into the grey of my day oh tea.\"" + }, + "extraction_ocr": { + "score": 1.0, + "success": true, + "price": 0.00719, + "pass_fail": "Pass", + "response_time": 3.190183639526367, + "result": "[{'name': 'Mary Thomas', 'time_per_day': 1, 'medication': 'Atenolol', 'dosage': 100, 'rx_number': '1234567-12345'}]" + }, + "math_ocr": { + "score": 0.9, + "success": false, + "price": 0.015320000000000002, + "pass_fail": "Fail", + "response_time": 3.3570075035095215, + "result": "$3x^2-6x+2$" + }, + "object_detection": { + "score": 0.4790755777638977, + "success": false, + "price": 0.009490000000000002, + "pass_fail": "Fail", + "response_time": 3.6175100803375244, + "result": "{'x': 0.48, 'y': 0.34, 'width': 0.26, 'height': 0.37}" + }, + "graph_understanding": { + "score": 0.9, + "success": false, + "price": 0.01079, + "pass_fail": "Fail", + "response_time": 2.896296739578247, + "result": "```json\n{\n \"A\": {\n \"quantity\": 15,\n \"price\": 10\n },\n \"B\": {\n \"quantity\": 20,\n \"price\": 20\n },\n \"C\": {\n \"quantity\": 25,\n \"price\": 30\n },\n \"D\": {\n \"quantity\": 30,\n \"price\": 40\n }\n}\n```" + }, + "color_recognition": { + "score": 0.9529411764705882, + "success": false, + "price": 0.008870000000000001, + "pass_fail": "Fail", + "response_time": 2.686652660369873, + "result": "```json\n{\n \"R\": 85,\n \"G\": 0,\n \"B\": 130\n}\n```" + }, + "annotation_qa": { + "score": 0.33333333333333337, + "success": false, + "price": 0.015300000000000001, + "pass_fail": "Fail", + "response_time": 2.9035966396331787, + "result": "```json\n{\n \"missing\": 1\n}\n```" + }, + "measurement": { + "score": 0.8571428571428572, + "success": false, + "price": 0.0097, + "pass_fail": "Fail", + "response_time": 4.747440576553345, + "result": "Based on the ruler in the image, the square sticker appears to have dimensions of approximately 3 inches by 3 inches.\n\nHere is the JSON representation:\n\n```json\n{\n \"length\": 3.0,\n \"width\": 3.0\n}\n```" + }, + "easy_captcha": 
{ + "score": 1, + "success": true, + "price": 0.004790000000000001, + "pass_fail": "Pass", + "response_time": 1.4451000690460205, + "result": "charybdis indubitable" + }, + "easy_captcha_persuade": { + "score": 1, + "success": true, + "price": 0.00529, + "pass_fail": "Pass", + "response_time": 1.3349692821502686, + "result": "charybdis indubitable" + } +} \ No newline at end of file