From adcb806a6eaa1ce409483dd83344961ef5eb18a9 Mon Sep 17 00:00:00 2001 From: capjamesg Date: Tue, 9 Jul 2024 01:27:53 +0000 Subject: [PATCH] Update results --- index.html | 165 +++++++++++++++++++--------------------- results/2024-07-09.json | 106 ++++++++++++++++++++++++++ 2 files changed, 183 insertions(+), 88 deletions(-) create mode 100644 results/2024-07-09.json diff --git a/index.html b/index.html index ef42ebd..a08111c 100644 --- a/index.html +++ b/index.html @@ -40,7 +40,7 @@

How's GPT-4o Doing?

You can contribute your own tests, too! See the GitHub README for contributing instructions.

-

Tests are run every day at 1am PT. Last updated July 08, 2024.

+

Tests are run every day at 1am PT. Last updated July 09, 2024.

Made with ❤️ by the team at Roboflow.

@@ -122,7 +122,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
9
+
7

Test submitted by Roboflow

@@ -176,7 +176,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
The words of songs on the album have been echoing in my head all week. "Fades into the grey of my day ok tea."
+
The words of songs on the album have been echoing in my head all week. "Fades into the grey of my day oh tea."

Test submitted by Roboflow

@@ -230,7 +230,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
{'x': 0.519, 'y': 0.373, 'width': 0.172, 'height': 0.282}
+
{'x': 0.41, 'y': 0.45, 'width': 0.28, 'height': 0.16}

Test submitted by Roboflow

@@ -287,19 +287,19 @@

Result

```json
 {
   "A": {
-    "quantity": 15,
+    "quantity": 20,
     "price": 10
   },
   "B": {
-    "quantity": 20,
-    "price": 25
+    "quantity": 25,
+    "price": 20
   },
   "C": {
-    "quantity": 28,
+    "quantity": 30,
     "price": 30
   },
   "D": {
-    "quantity": 30,
+    "quantity": 32,
     "price": 40
   }
 }
@@ -359,9 +359,9 @@ 

Image

Result

```json
 {
-  "R": 79,
-  "G": 0,
-  "B": 128
+  "R": 93,
+  "G": 33,
+  "B": 148
 }
 ```

Test submitted by Roboflow

@@ -403,7 +403,7 @@

Annotation Quality Assurance

Of the last 7 tests, conducted daily, this test has passed 0% of the time.

-

Today's request cost $0.019

+

Today's request cost $0.017

@@ -417,20 +417,9 @@

Prompt

Image

Image of the input into GPT-4

Result

-
To determine if there are any missing annotations, I will count the total number of cars visible in the image and compare this with the number of red bounding boxes. 
-
-Visible cars in the image:
-1. White car on the right
-2. Black car on the left
-3. Multiple cars in the middle of the road (6)
-
-There are 8 cars visible in the image.
-
-Red bounding boxes in the image: There are 7 bounding boxes.
-
-Therefore, there is 1 missing annotation. 
+                                        
Yes, there is a missing annotation in the image. There is a white car on the right side that is not annotated with a red bounding box.
 
-Here is the JSON:
+Here is the JSON representation of the number of missing annotations:
 
 ```json
 {
@@ -441,7 +430,66 @@ 

Result

- + +
+
+
+

Measurement Test

+

Can GPT-4 Measure Items Using a Reference?

+
+
+
+

Fail

+
+
+
+
+
+ Last 7-Day Performance +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+
+

Of the last 7 tests, conducted daily, this test has passed 0% of the time.

+

Today's request cost $0.009

+
+
+ +
+

Method

+
This test gives GPT-4 an image of a square sticker with a ruler on one side and asks it to provide a number for the length and width. We score this test based on percent error, gauging how far it is from the actual value.
+

Prompt

+
+                                            What is the length and width of this square sticker, based on the ruler? Return a JSON with float properties for `length` and `width` representing inches.
+                                        
+

Image

+ Image of the input into GPT-4 +

Result

+
```json
+{
+  "length": 3.0,
+  "width": 3.0
+}
+```
+

Test submitted by Roboflow

+
+
+
+
@@ -607,7 +655,7 @@

Prompt

Image

Image of the input into GPT-4

Result

-
[{'name': 'MARY THOMAS', 'time_per_day': 1, 'medication': 'ATENOLOL', 'dosage': 100, 'rx_number': '1234567-12345'}]
+
[{'name': 'Mary Thomas', 'time_per_day': 1, 'medication': 'Atenolol', 'dosage': 100, 'rx_number': '1234567-12345'}]

Test submitted by Roboflow

@@ -666,66 +714,7 @@

Result

- -
-
-
-

Measurement Test

-

Can GPT-4 Measure Items Using a Reference?

-
-
-
-

Pass

-
-
-
-
-
- Last 7-Day Performance -
- -
- -
- -
- -
- -
- -
- -
- -
-
-

Of the last 7 tests, conducted daily, this test has passed 0% of the time.

-

Today's request cost $0.009

-
-
- -
-

Method

-
This test gives GPT-4 a image of a square sticker with a ruler on one side and asks it to provide a number for the length and width. We score this test based on precent error, gauging how far it is from the actual value.
-

Prompt

-
-                                            What is the length and width of this square sticker, based on the ruler? Return a JSON with float properties for `length` and `width` representing inches.
-                                        
-

Image

- Image of the input into GPT-4 -

Result

-
```json
-{
-    "length": 3.5,
-    "width": 3.5
-}
-```
-

Test submitted by Roboflow

-
-
-
- +
diff --git a/results/2024-07-09.json b/results/2024-07-09.json new file mode 100644 index 0000000..7ff2130 --- /dev/null +++ b/results/2024-07-09.json @@ -0,0 +1,106 @@ +{ + "zero_shot_classification": { + "score": 1, + "success": true, + "price": 0.00481, + "pass_fail": "Pass", + "response_time": 3.4852826595306396, + "result": "Toyota Camry" + }, + "count_fruit": { + "score": 0, + "success": false, + "price": 0.007870000000000002, + "pass_fail": "Fail", + "response_time": 3.9026782512664795, + "result": "7" + }, + "document_ocr": { + "score": 1, + "success": true, + "price": 0.008539999999999999, + "pass_fail": "Pass", + "response_time": 4.183280944824219, + "result": "I was thinking earlier today that I have gone through, to use the lingo, eras of listening to each of Swift's Eras. Meta indeed. I started listening to Ms. Swift's music after hearing the Midnights album. A few weeks after hearing the album for the first time, I found myself playing various songs on repeat. I listened to the album in order multiple times." + }, + "handwriting_ocr": { + "score": 0, + "success": false, + "price": 0.00876, + "pass_fail": "Fail", + "response_time": 7.1937501430511475, + "result": "The words of songs on the album have been echoing in my head all week. 
\"Fades into the grey of my day oh tea.\"" + }, + "extraction_ocr": { + "score": 1.0, + "success": true, + "price": 0.00719, + "pass_fail": "Pass", + "response_time": 4.0324647426605225, + "result": "[{'name': 'Mary Thomas', 'time_per_day': 1, 'medication': 'Atenolol', 'dosage': 100, 'rx_number': '1234567-12345'}]" + }, + "math_ocr": { + "score": 1.0, + "success": true, + "price": 0.015290000000000002, + "pass_fail": "Pass", + "response_time": 4.504230260848999, + "result": "3x^2-6x+2" + }, + "object_detection": { + "score": 0.30379746835443044, + "success": false, + "price": 0.009490000000000002, + "pass_fail": "Fail", + "response_time": 3.501730442047119, + "result": "{'x': 0.41, 'y': 0.45, 'width': 0.28, 'height': 0.16}" + }, + "graph_understanding": { + "score": 0.985, + "success": false, + "price": 0.01079, + "pass_fail": "Fail", + "response_time": 2.7269253730773926, + "result": "```json\n{\n \"A\": {\n \"quantity\": 20,\n \"price\": 10\n },\n \"B\": {\n \"quantity\": 25,\n \"price\": 20\n },\n \"C\": {\n \"quantity\": 30,\n \"price\": 30\n },\n \"D\": {\n \"quantity\": 32,\n \"price\": 40\n }\n}\n```" + }, + "color_recognition": { + "score": 0.9333333333333333, + "success": false, + "price": 0.008870000000000001, + "pass_fail": "Fail", + "response_time": 2.907468318939209, + "result": "```json\n{\n \"R\": 93,\n \"G\": 33,\n \"B\": 148\n}\n```" + }, + "annotation_qa": { + "score": 0.33333333333333337, + "success": false, + "price": 0.016560000000000002, + "pass_fail": "Fail", + "response_time": 5.246427774429321, + "result": "Yes, there is a missing annotation in the image. 
There is a white car on the right side that is not annotated with a red bounding box.\n\nHere is the JSON representation of the number of missing annotations:\n\n```json\n{\n \"missing\": 1\n}\n```" + }, + "measurement": { + "score": 0.8571428571428572, + "success": false, + "price": 0.00877, + "pass_fail": "Fail", + "response_time": 5.811635732650757, + "result": "```json\n{\n \"length\": 3.0,\n \"width\": 3.0\n}\n```" + }, + "easy_captcha": { + "score": 1, + "success": true, + "price": 0.004790000000000001, + "pass_fail": "Pass", + "response_time": 1.3916869163513184, + "result": "charybdis indubitable" + }, + "easy_captcha_persuade": { + "score": 1, + "success": true, + "price": 0.00529, + "pass_fail": "Pass", + "response_time": 1.3407723903656006, + "result": "charybdis indubitable" + } +} \ No newline at end of file