@@ -114,13 +114,15 @@ def validate_messages(
114114 """Validate the messages column."""
115115 if not isinstance (messages , list ):
116116 raise InvalidFileFormatError (
117- message = "The dataset is malformed, the `messages` column must be a list." ,
117+ message = f"Invalid format on line { idx + 1 } of the input file. "
118+ f"Expected a list of messages. Found { type (messages )} " ,
118119 line_number = idx + 1 ,
119120 error_source = "key_value" ,
120121 )
121122 if not messages :
122123 raise InvalidFileFormatError (
123- message = "The dataset is malformed, the `messages` column must not be empty." ,
124+ message = f"Invalid format on line { idx + 1 } of the input file. "
125+ f"Expected a non-empty list of messages. Found empty list" ,
124126 line_number = idx + 1 ,
125127 error_source = "key_value" ,
126128 )
@@ -132,21 +134,29 @@ def validate_messages(
132134
133135 previous_role = None
134136 for message in messages :
135- if any ( column not in message for column in REQUIRED_COLUMNS_MESSAGE ):
137+ if not isinstance ( message , dict ):
136138 raise InvalidFileFormatError (
137- message = "The dataset is malformed. "
138- "Each message in the messages column must have "
139- f"{ REQUIRED_COLUMNS_MESSAGE } columns." ,
139+ message = f"Invalid format on line { idx + 1 } of the input file. "
140+ f"Expected a dictionary in the messages list. Found { type (message )} " ,
140141 line_number = idx + 1 ,
141142 error_source = "key_value" ,
142143 )
143144 for column in REQUIRED_COLUMNS_MESSAGE :
144- if not isinstance ( message [ column ], str ) :
145+ if column not in message :
145146 raise InvalidFileFormatError (
146- message = f"The dataset is malformed, the column `{ column } ` must be of the string type." ,
147+ message = f"Field `{ column } ` is missing for a turn `{ message } ` on line { idx + 1 } "
148+ "of the input file." ,
147149 line_number = idx + 1 ,
148150 error_source = "key_value" ,
149151 )
152+ else :
153+ if not isinstance (message [column ], str ):
154+ raise InvalidFileFormatError (
155+ message = f"Invalid format on line { idx + 1 } in the column { column } for turn `{ message } ` "
156+ f"of the input file. Expected string. Found { type (message [column ])} " ,
157+ line_number = idx + 1 ,
158+ error_source = "text_field" ,
159+ )
150160
151161 if has_weights and "weight" in message :
152162 weight = message ["weight" ]
@@ -164,8 +174,8 @@ def validate_messages(
164174 )
165175 if message ["role" ] not in POSSIBLE_ROLES_CONVERSATION :
166176 raise InvalidFileFormatError (
167- message = f"Invalid role { message ['role' ]} in conversation, possible roles: "
168- f"{ ', ' . join ( POSSIBLE_ROLES_CONVERSATION ) } " ,
177+ message = f"Found invalid role ` { message ['role' ]} ` in the messages on the line { idx + 1 } . "
178+ f"Possible roles in the conversation are: { POSSIBLE_ROLES_CONVERSATION } " ,
169179 line_number = idx + 1 ,
170180 error_source = "key_value" ,
171181 )
0 commit comments