diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bbdc99..6033a75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,35 @@ # Changelog +## v3.8.2 (2025-11-09) + +### New Features + +- **Centralized Error Detection**: Added centralized error detection system with configurable patterns + - New `errors.yaml` configuration file with error patterns + - Supports three pattern types: fuzzy matches, exact matches, and regex matches + - Automatically detects errors in stdout and moves them to error field + - Eliminates need for external applications to parse error messages + - Includes patterns for HTTP 4xx/5xx errors, DNS failures, connection errors, and timeouts + - Added `ErrorDetector` class for pattern-based error detection + +- **Markdown-KV Output Format**: Added `markdownkv` output format optimized for LLM understanding + - Based on research showing 60.7% LLM accuracy vs 44.3% for CSV + - Ideal for RAG pipelines and AI-based systems processing tabular data + - Hierarchical structure with markdown headers and code blocks + - Supported in both local and server modes + - Reference: [Which Table Format Do LLMs Understand Best?](https://www.empiricalagents.com/blog/which-table-format-do-llms-understand-best) + +### Dependencies + +- Added `PyYAML>=5.4.0` for error pattern configuration + +### Testing + +- Added comprehensive test suite for error detection (`tests/test_error_detection.py`) +- Added test suite for Markdown-KV format (`tests/test_markdownkv_format.py`) +- Tests for regex pattern matching, DNS errors, connection errors, and timeouts +- Tests for LLM-friendly data formatting + ## v3.8.1 (2025-06-25) ### Updates diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..ab11a96 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include pystackql/errors.yaml diff --git a/pyproject.toml b/pyproject.toml index 6e28676..49f5698 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = 
"pystackql" -version = "3.8.1" +version = "3.8.2" description = "A Python interface for StackQL" readme = "README.rst" authors = [ @@ -31,6 +31,7 @@ dependencies = [ "nest-asyncio>=1.5.5", "termcolor>=1.1.0", "tqdm>=4.61.0", + "PyYAML>=5.4.0", ] [tool.setuptools.packages.find] diff --git a/pystackql/core/error_detector.py b/pystackql/core/error_detector.py new file mode 100644 index 0000000..9bd77ef --- /dev/null +++ b/pystackql/core/error_detector.py @@ -0,0 +1,151 @@ +# pystackql/core/error_detector.py + +""" +Error detection module for PyStackQL. + +This module provides centralized error detection logic that checks messages +against predefined error patterns loaded from errors.yaml. +""" + +import os +import re +import yaml + + +class ErrorDetector: + """Detects errors in query results based on predefined patterns. + + This class loads error patterns from errors.yaml and provides methods + to check if a message contains any of these error patterns. + """ + + def __init__(self): + """Initialize the ErrorDetector by loading error patterns from errors.yaml.""" + self.fuzzy_patterns = [] + self.exact_patterns = [] + self.regex_patterns = [] # List of compiled regex pattern objects + self._load_error_patterns() + + def _load_error_patterns(self): + """Load error patterns from the errors.yaml file. + + The errors.yaml file should be located in the same directory as this module. 
+ """ + # Get the directory containing the pystackql package + current_dir = os.path.dirname(os.path.abspath(__file__)) + package_dir = os.path.dirname(current_dir) + errors_file = os.path.join(package_dir, 'errors.yaml') + + try: + if os.path.exists(errors_file): + with open(errors_file, 'r') as f: + error_config = yaml.safe_load(f) + + if error_config and 'errors' in error_config: + errors = error_config['errors'] + + # Load fuzzy match patterns (case-insensitive substring matching) + if 'fuzzy_matches' in errors: + self.fuzzy_patterns = [ + pattern.lower() + for pattern in errors['fuzzy_matches'] + if pattern + ] + + # Load exact match patterns (case-sensitive exact/prefix matching) + if 'exact_matches' in errors: + self.exact_patterns = [ + pattern + for pattern in errors['exact_matches'] + if pattern + ] + + # Load regex patterns (compile them for efficiency) + if 'regex_matches' in errors: + self.regex_patterns = [] + for pattern in errors['regex_matches']: + if pattern: + try: + # Compile with IGNORECASE flag for case-insensitive matching + compiled = re.compile(pattern, re.IGNORECASE) + self.regex_patterns.append((pattern, compiled)) + except re.error as regex_err: + print(f"Warning: Invalid regex pattern '{pattern}': {regex_err}") + except Exception as e: + # If we can't load the error patterns, continue with empty lists + # This ensures the module doesn't break existing functionality + print(f"Warning: Could not load error patterns from {errors_file}: {e}") + + def is_error(self, message): + """Check if a message contains any error patterns. 
+ + Args: + message (str): The message to check for error patterns + + Returns: + bool: True if the message matches any error pattern, False otherwise + """ + if not message or not isinstance(message, str): + return False + + message_lower = message.lower() + + # Check fuzzy matches (case-insensitive substring matching) + for pattern in self.fuzzy_patterns: + if pattern in message_lower: + return True + + # Check exact matches (exact string or starts with prefix) + for pattern in self.exact_patterns: + if message == pattern or message.startswith(pattern): + return True + + # Check regex matches + for pattern_str, compiled_pattern in self.regex_patterns: + if compiled_pattern.search(message): + return True + + return False + + def extract_error_info(self, message): + """Extract error information from a message. + + Args: + message (str): The error message + + Returns: + dict: Dictionary containing error details with 'error' and 'detected_pattern' keys + """ + if not self.is_error(message): + return None + + message_lower = message.lower() + detected_pattern = None + pattern_type = None + + # Find which pattern was matched (check in order: fuzzy, exact, regex) + for pattern in self.fuzzy_patterns: + if pattern in message_lower: + detected_pattern = pattern + pattern_type = "fuzzy" + break + + if not detected_pattern: + for pattern in self.exact_patterns: + if message == pattern or message.startswith(pattern): + detected_pattern = pattern + pattern_type = "exact" + break + + if not detected_pattern: + for pattern_str, compiled_pattern in self.regex_patterns: + if compiled_pattern.search(message): + detected_pattern = pattern_str + pattern_type = "regex" + break + + return { + "error": message, + "detected_pattern": detected_pattern, + "pattern_type": pattern_type + } diff --git a/pystackql/core/output.py b/pystackql/core/output.py index 9227c27..79b1213 100644 --- a/pystackql/core/output.py +++ b/pystackql/core/output.py @@ -8,6 +8,7 @@ import json from io import 
StringIO +from .error_detector import ErrorDetector class OutputFormatter: """Formats query results into different output formats. @@ -18,18 +19,19 @@ class OutputFormatter: def __init__(self, output_format='dict'): """Initialize the OutputFormatter. - + Args: output_format (str, optional): The output format. Defaults to 'dict'. Allowed values: 'dict', 'pandas', 'csv' - + Raises: ValueError: If an invalid output format is specified """ - ALLOWED_OUTPUTS = {'dict', 'pandas', 'csv'} + ALLOWED_OUTPUTS = {'dict', 'pandas', 'csv', 'markdownkv'} if output_format.lower() not in ALLOWED_OUTPUTS: raise ValueError(f"Invalid output format. Expected one of {ALLOWED_OUTPUTS}, got {output_format}.") self.output_format = output_format.lower() + self.error_detector = ErrorDetector() def format_query_result(self, result, suppress_errors=True): """Format a query result. @@ -61,10 +63,10 @@ def format_query_result(self, result, suppress_errors=True): def _format_exception(self, exception_msg): """Format an exception message. - + Args: exception_msg (str): The exception message - + Returns: The formatted exception in the specified output format """ @@ -73,15 +75,17 @@ def _format_exception(self, exception_msg): return pd.DataFrame({'error': [exception_msg]}) if exception_msg else pd.DataFrame({'error': []}) elif self.output_format == 'csv': return exception_msg + elif self.output_format == 'markdownkv': + return self._format_markdownkv_error(exception_msg) else: # dict return [{"error": exception_msg}] def _format_error(self, error_msg): """Format an error message. 
- + Args: error_msg (str): The error message - + Returns: The formatted error in the specified output format """ @@ -90,26 +94,44 @@ def _format_error(self, error_msg): return pd.DataFrame({'error': [error_msg]}) if error_msg else pd.DataFrame({'error': []}) elif self.output_format == 'csv': return error_msg + elif self.output_format == 'markdownkv': + return self._format_markdownkv_error(error_msg) else: # dict return [{"error": error_msg}] def _format_data(self, data): """Format data. - + This method processes SQL type objects from StackQL: - SQL NULL values: {'String': '', 'Valid': False} → None - Regular values: {'String': 'value', 'Valid': True} → 'value' - Empty strings: {'String': '', 'Valid': True} → '' (preserved as empty string) - + + Additionally, this method checks for error patterns in the data and + converts them to proper error responses. + Args: data (str): The data string - + Returns: The formatted data in the specified output format """ if self.output_format == 'csv': + # For CSV, check if the raw data contains error patterns + if self.error_detector.is_error(data): + return data # Return the error message as-is for CSV return data - + + if self.output_format == 'markdownkv': + # For markdownkv, check for errors before parsing + if isinstance(data, str) and self.error_detector.is_error(data): + return self._format_markdownkv_error(data) + + # Check if the raw data string itself is an error message (before JSON parsing) + if isinstance(data, str) and self.error_detector.is_error(data): + # The entire response is an error message + return self._format_error(data) + try: # Attempt to parse JSON first raw_json_data = json.loads(data) @@ -129,19 +151,27 @@ def _format_data(self, data): try: # Process the JSON data to clean up SQL type objects processed_json_data = self._process_sql_types(raw_json_data) - + # Handle empty data if not processed_json_data: return pd.DataFrame() if self.output_format == 'pandas' else [] - + + # Check if the processed data 
contains error patterns + # This handles cases where StackQL returns error messages in structured data + detected_error = self._check_data_for_errors(processed_json_data) + if detected_error: + return self._format_error(detected_error) + if self.output_format == 'pandas': import pandas as pd # Convert the preprocessed JSON data to a DataFrame return pd.DataFrame(processed_json_data) - + elif self.output_format == 'markdownkv': + return self._format_markdownkv(processed_json_data) + # Return the preprocessed dictionary data return processed_json_data - + except Exception as e: # Handle any errors during processing error_msg = f"Error processing data: {str(e)}" @@ -149,7 +179,44 @@ def _format_data(self, data): import pandas as pd return pd.DataFrame([{"error": error_msg}]) return [{"error": error_msg}] - + + def _check_data_for_errors(self, data): + """Check if processed data contains error patterns. + + This method recursively checks all string values in the data structure + to detect error patterns that might have been returned as valid data. + + Args: + data: The processed data (list, dict, or primitive type) + + Returns: + str: The error message if an error pattern is detected, None otherwise + """ + if isinstance(data, list): + # Check each item in the list + for item in data: + error = self._check_data_for_errors(item) + if error: + return error + + elif isinstance(data, dict): + # Check each value in the dictionary + for key, value in data.items(): + # Check string values for error patterns + if isinstance(value, str) and self.error_detector.is_error(value): + return value + # Recursively check nested structures + error = self._check_data_for_errors(value) + if error: + return error + + elif isinstance(data, str): + # Check if the string itself is an error + if self.error_detector.is_error(data): + return data + + return None + def _process_sql_types(self, data): """Process SQL type objects in the data. 
@@ -189,7 +256,7 @@ def _process_sql_types(self, data): def _format_empty(self): """Format an empty result. - + Returns: An empty result in the specified output format """ @@ -198,15 +265,71 @@ def _format_empty(self): return pd.DataFrame() elif self.output_format == 'csv': return "" + elif self.output_format == 'markdownkv': + return "# Query Results\n\nNo records found.\n" else: # dict return [] + + def _format_markdownkv(self, data): + """Format data as Markdown Key-Value pairs. + + This format is optimized for LLM understanding based on research showing + it achieves 60.7% accuracy vs 44.3% for CSV when LLMs process tabular data. + + Args: + data: The processed data (list of dicts) + + Returns: + str: Markdown-formatted key-value representation + """ + if not data: + return "# Query Results\n\nNo records found.\n" + + # Handle single dict (convert to list for consistency) + if isinstance(data, dict): + data = [data] + + output = ["# Query Results\n"] + + for idx, record in enumerate(data, 1): + output.append(f"## Record {idx}\n") + output.append("```") + + # Format each key-value pair + for key, value in record.items(): + # Handle None values + if value is None: + value = "null" + output.append(f"{key}: {value}") + + output.append("```\n") + + return "\n".join(output) + + def _format_markdownkv_error(self, error_msg): + """Format an error message in Markdown-KV style. + + Args: + error_msg (str): The error message + + Returns: + str: Markdown-formatted error + """ + return f"""# Query Results + +## Error + +``` +error: {error_msg} +``` +""" def format_statement_result(self, result): """Format a statement result. 
- + Args: result (dict): The raw statement result from the executor - + Returns: The formatted result in the specified output format """ @@ -214,14 +337,42 @@ def format_statement_result(self, result): if "exception" in result: exception_msg = result["exception"] return self._format_exception(exception_msg) - + # Message on stderr or empty message message = result.get("error", "") - + + # Check if the message contains error patterns + if message and self.error_detector.is_error(message): + # Return as error instead of as a regular message + return self._format_error(message) + if self.output_format == 'pandas': import pandas as pd return pd.DataFrame({'message': [message]}) if message else pd.DataFrame({'message': []}) elif self.output_format == 'csv': return message + elif self.output_format == 'markdownkv': + return self._format_markdownkv_statement(message) else: # dict - return {'message': message.rstrip('\n')} \ No newline at end of file + return {'message': message.rstrip('\n')} + + def _format_markdownkv_statement(self, message): + """Format a statement result message in Markdown-KV style. 
+ + Args: + message (str): The statement result message + + Returns: + str: Markdown-formatted statement result + """ + if not message: + return "# Statement Result\n\nNo message returned.\n" + + return f"""# Statement Result + +## Result + +``` +message: {message.rstrip()} +``` +""" \ No newline at end of file diff --git a/pystackql/core/stackql.py b/pystackql/core/stackql.py index 2606c3c..ea73f59 100644 --- a/pystackql/core/stackql.py +++ b/pystackql/core/stackql.py @@ -310,6 +310,12 @@ def executeStmt(self, query, custom_auth=None, env_vars=None, **kwargs): elif output_format == 'csv': # Return the string representation of the result return result[0]['message'] + elif output_format == 'markdownkv': + from .output import OutputFormatter + temp_formatter = OutputFormatter('markdownkv') + # Extract message from result + message = result[0].get('message', '') if result else '' + return temp_formatter._format_markdownkv_statement(message) else: return result else: @@ -392,6 +398,10 @@ def execute(self, query, suppress_errors=True, custom_auth=None, env_vars=None, return pd.read_json(StringIO(json_str)) elif output_format == 'csv': raise ValueError("CSV output is not supported in server_mode.") + elif output_format == 'markdownkv': + from .output import OutputFormatter + temp_formatter = OutputFormatter('markdownkv') + return temp_formatter._format_markdownkv(result) else: # Assume 'dict' output return result else: diff --git a/pystackql/errors.yaml b/pystackql/errors.yaml new file mode 100644 index 0000000..64b4e9e --- /dev/null +++ b/pystackql/errors.yaml @@ -0,0 +1,52 @@ +# Error patterns for centralized error detection in PyStackQL +# +# This file defines patterns that should be detected as errors when they appear +# in query results. These patterns are checked against messages returned in stdout +# to identify error conditions that would otherwise be treated as valid data. 
+# +# Pattern Types: +# - fuzzy_matches: Substring matching (case-insensitive) +# - exact_matches: Exact or prefix matching (case-sensitive; the message equals or starts with the pattern) +# - regex_matches: Regular expression matching (case-insensitive; for complex patterns with variable parts) + +errors: + # Fuzzy matches - will match if the pattern appears anywhere in the message + fuzzy_matches: + # HTTP error status codes (4xx client errors, 5xx server errors) + - "http response status code: 4" + - "http response status code: 5" + + # StackQL-specific error patterns from stackql-deploy + - "disparity in fields" + - "cannot find matching operation" + + # Additional StackQL error patterns + - "invalid query" + - "syntax error" + + # Exact matches - must match the entire message or start with this prefix + exact_matches: + - "error:" + - "ERROR:" + - "Error:" + - "FAILED" + - "FAILURE" + + # Regex matches - regular expressions for complex error patterns + # Use standard Python regex syntax (case-insensitive by default) + regex_matches: + # Network/DNS errors + - 'dial tcp:.*no such host' + - 'Get ".*".*dial tcp.*lookup.*no such host' + + # Connection errors + - 'dial tcp.*connection refused' + - 'unable to connect to.*connection refused' + + # Timeout errors + - 'context deadline exceeded' + - 'timeout.*waiting for' + + # Generic network errors + - 'dial tcp.*i/o timeout' + - 'net/http.*timeout' diff --git a/requirements.txt b/requirements.txt index 8895f3d..5270cf7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ pandas>=1.3.0 requests>=2.25.0 IPython>=7.0.0 termcolor>=1.1.0 +PyYAML>=5.4.0 # Documentation sphinx>=4.0.0 diff --git a/tests/test_error_detection.py b/tests/test_error_detection.py new file mode 100644 index 0000000..b3aa35c --- /dev/null +++ b/tests/test_error_detection.py @@ -0,0 +1,371 @@ +# tests/test_error_detection.py + +""" +Error detection tests for PyStackQL. + +This module tests the centralized error detection functionality that identifies +error patterns in query results.
+""" + +import os +import sys +import json +import pytest + +# Add the parent directory to the path so we can import from pystackql +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from pystackql.core.error_detector import ErrorDetector +from pystackql.core.output import OutputFormatter + + +class TestErrorDetector: + """Tests for the ErrorDetector class.""" + + def setup_method(self): + """Set up test fixtures.""" + self.detector = ErrorDetector() + + def test_detector_initialization(self): + """Test that ErrorDetector initializes and loads patterns.""" + assert self.detector is not None + assert isinstance(self.detector.fuzzy_patterns, list) + assert isinstance(self.detector.exact_patterns, list) + # Check that some patterns were loaded + assert len(self.detector.fuzzy_patterns) > 0 + assert len(self.detector.exact_patterns) > 0 + + def test_http_4xx_error_detection(self): + """Test detection of HTTP 4xx status codes.""" + messages = [ + "http response status code: 404", + "http response status code: 400, response body: Bad Request", + "HTTP RESPONSE STATUS CODE: 403 Forbidden", + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect error in: {msg}" + + def test_http_5xx_error_detection(self): + """Test detection of HTTP 5xx status codes.""" + messages = [ + "http response status code: 500", + "http response status code: 503, service unavailable", + "HTTP RESPONSE STATUS CODE: 502 Bad Gateway", + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect error in: {msg}" + + def test_exact_match_detection(self): + """Test detection of exact match patterns.""" + messages = [ + "error: invalid syntax", + "ERROR: something went wrong", + "Error: connection failed", + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect error in: {msg}" + + def test_fuzzy_match_detection(self): + """Test detection of fuzzy match patterns.""" + messages = [ + "An error 
occurred during processing", + "Operation failed due to timeout", + "Cannot find matching operation for this request", + "Disparity in fields to insert and supplied data", + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect error in: {msg}" + + def test_non_error_messages(self): + """Test that non-error messages are not detected as errors.""" + messages = [ + "Query executed successfully", + "Retrieved 10 rows", + "Connection established", + "Data retrieved from provider", + ] + for msg in messages: + assert not self.detector.is_error(msg), f"Should not detect error in: {msg}" + + def test_case_insensitive_fuzzy_matching(self): + """Test that fuzzy matching is case-insensitive.""" + messages = [ + "ERROR occurred", + "Error Occurred", + "error occurred", + "An EXCEPTION was raised", + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect error in: {msg}" + + def test_extract_error_info(self): + """Test extraction of error information.""" + msg = "http response status code: 404" + info = self.detector.extract_error_info(msg) + assert info is not None + assert "error" in info + assert "detected_pattern" in info + assert info["error"] == msg + assert info["detected_pattern"] is not None + + def test_extract_error_info_non_error(self): + """Test that non-error messages return None.""" + msg = "Success" + info = self.detector.extract_error_info(msg) + assert info is None + + def test_empty_string_handling(self): + """Test handling of empty strings.""" + assert not self.detector.is_error("") + assert not self.detector.is_error(None) + + def test_non_string_handling(self): + """Test handling of non-string inputs.""" + assert not self.detector.is_error(123) + assert not self.detector.is_error([]) + assert not self.detector.is_error({}) + + def test_regex_pattern_loading(self): + """Test that regex patterns are loaded and compiled.""" + assert len(self.detector.regex_patterns) > 0 + # Check that patterns are tuples of 
(pattern_str, compiled_regex) + for item in self.detector.regex_patterns: + assert isinstance(item, tuple) + assert len(item) == 2 + pattern_str, compiled = item + assert isinstance(pattern_str, str) + # Check it's a compiled regex + assert hasattr(compiled, 'search') + + def test_regex_dns_error_detection(self): + """Test detection of DNS lookup errors using regex.""" + messages = [ + 'Get "https://fred.brew.sh/api/formula/stackql.json?": dial tcp: lookup fred.brew.sh on 8.8.8.8:53: no such host', + 'dial tcp: lookup example.com on 1.1.1.1:53: no such host', + 'Get "http://api.example.com": dial tcp: lookup api.example.com on 192.168.1.1:53: no such host', + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect DNS error in: {msg}" + + def test_regex_connection_refused(self): + """Test detection of connection refused errors using regex.""" + messages = [ + 'dial tcp 192.168.1.1:5432: connection refused', + 'dial tcp [::1]:8080: connection refused', + 'unable to connect to server: connection refused', + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect connection error in: {msg}" + + def test_regex_timeout_errors(self): + """Test detection of timeout errors using regex.""" + messages = [ + 'context deadline exceeded', + 'dial tcp 10.0.0.1:443: i/o timeout', + 'net/http: request canceled while waiting for connection (Client.Timeout exceeded)', + 'timeout while waiting for response', + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect timeout error in: {msg}" + + def test_regex_case_insensitive(self): + """Test that regex matching is case-insensitive.""" + messages = [ + 'DIAL TCP: NO SUCH HOST', + 'Context Deadline Exceeded', + 'Connection Refused', + ] + for msg in messages: + assert self.detector.is_error(msg), f"Should detect error (case-insensitive) in: {msg}" + + def test_extract_error_info_with_regex(self): + """Test error info extraction for regex matches.""" + msg = 'Get 
"https://example.com": dial tcp: lookup example.com on 8.8.8.8:53: no such host' + info = self.detector.extract_error_info(msg) + assert info is not None + assert info["error"] == msg + assert info["pattern_type"] == "regex" + assert info["detected_pattern"] is not None + # Should match one of the DNS error patterns + assert "no such host" in info["detected_pattern"] + + +class TestOutputFormatterErrorDetection: + """Tests for error detection integration in OutputFormatter.""" + + def setup_method(self): + """Set up test fixtures.""" + self.formatter = OutputFormatter(output_format='dict') + + def test_format_error_in_raw_data(self): + """Test detection of errors in raw data strings.""" + error_data = "http response status code: 404, response body: Not Found" + result = self.formatter._format_data(error_data) + + assert isinstance(result, list) + assert len(result) > 0 + assert "error" in result[0] + + def test_format_error_in_json_data(self): + """Test detection of errors in JSON-formatted data.""" + # Simulate data returned by StackQL with an error message + data = [ + { + "message": "http response status code: 404", + "status": "failed" + } + ] + json_data = json.dumps(data) + result = self.formatter._format_data(json_data) + + assert isinstance(result, list) + assert len(result) > 0 + assert "error" in result[0] + + def test_format_valid_data_not_detected_as_error(self): + """Test that valid data is not detected as error.""" + data = [ + { + "formula_name": "python", + "version": "3.9.0", + "status": "installed" + } + ] + json_data = json.dumps(data) + result = self.formatter._format_data(json_data) + + assert isinstance(result, list) + assert len(result) > 0 + # Should return the data, not an error + if "error" not in result[0]: + assert "formula_name" in result[0] or "version" in result[0] + + def test_check_data_for_errors_in_dict(self): + """Test error detection in dictionary data.""" + data = { + "status": "failed", + "message": "error: operation failed" + 
} + error = self.formatter._check_data_for_errors(data) + assert error is not None + assert "error" in error.lower() + + def test_check_data_for_errors_in_list(self): + """Test error detection in list data.""" + data = [ + {"name": "test1", "status": "ok"}, + {"name": "test2", "message": "http response status code: 500"} + ] + error = self.formatter._check_data_for_errors(data) + assert error is not None + assert "http response status code" in error.lower() + + def test_check_data_for_errors_nested(self): + """Test error detection in nested data structures.""" + data = { + "results": [ + { + "id": 1, + "details": { + "status": "error: connection timeout" + } + } + ] + } + error = self.formatter._check_data_for_errors(data) + assert error is not None + + def test_check_data_for_errors_no_error(self): + """Test that valid data returns None.""" + data = { + "status": "success", + "results": [ + {"name": "item1", "value": 100}, + {"name": "item2", "value": 200} + ] + } + error = self.formatter._check_data_for_errors(data) + assert error is None + + def test_format_statement_with_error(self): + """Test statement result formatting with error detection.""" + result = { + "error": "http response status code: 404" + } + formatted = self.formatter.format_statement_result(result) + + # Should be formatted as error, not as message + if isinstance(formatted, dict): + # For dict output, check if it's an error list or message + if isinstance(formatted, list): + assert "error" in formatted[0] + elif "error" in formatted: + assert formatted["error"] is not None + elif isinstance(formatted, list): + assert "error" in formatted[0] + + def test_format_statement_without_error(self): + """Test statement result formatting without errors.""" + result = { + "error": "okta provider, version 'v0.5.0' successfully installed" + } + formatted = self.formatter.format_statement_result(result) + + # Should be formatted as message since it's not an error + assert formatted is not None + + +class 
TestHomebrewProviderErrorScenario: + """Tests for the specific homebrew provider error scenario.""" + + def setup_method(self): + """Set up test fixtures.""" + self.formatter = OutputFormatter(output_format='dict') + self.detector = ErrorDetector() + + def test_homebrew_404_error_detection(self): + """Test detection of homebrew 404 error message.""" + # This is the actual error message from the user's example + error_msg = "http response status code: 404, response body: ..." + + # Should be detected as error + assert self.detector.is_error(error_msg) + + def test_homebrew_404_formatting(self): + """Test formatting of homebrew 404 error.""" + # Simulate the raw data that would come from StackQL + error_data = "http response status code: 404, response body: ..." + + result = self.formatter._format_data(error_data) + + # Should be formatted as error + assert isinstance(result, list) + assert len(result) > 0 + assert "error" in result[0] + assert "404" in str(result[0]["error"]) + + def test_homebrew_valid_formula_not_error(self): + """Test that valid homebrew formula data is not detected as error.""" + # Simulate valid formula data + valid_data = [ + { + "formula_name": "python", + "full_name": "python@3.9", + "homepage": "https://www.python.org", + "latest_version": "3.9.7", + "license": "Python-2.0" + } + ] + json_data = json.dumps(valid_data) + + result = self.formatter._format_data(json_data) + + # Should return data, not error + assert isinstance(result, list) + assert len(result) > 0 + if "error" not in result[0]: + assert "formula_name" in result[0] + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/test_markdownkv_format.py b/tests/test_markdownkv_format.py new file mode 100644 index 0000000..44c5955 --- /dev/null +++ b/tests/test_markdownkv_format.py @@ -0,0 +1,216 @@ +# tests/test_markdownkv_format.py + +""" +Tests for Markdown-KV output format. 
+
+This format is optimized for LLM understanding based on research showing
+it achieves 60.7% accuracy vs 44.3% for CSV when LLMs process tabular data.
+
+Reference: https://www.empiricalagents.com/blog/which-table-format-do-llms-understand-best
+"""
+
+import json
+import os
+import sys
+
+import pytest
+
+# Add the parent directory to the path so we can import from pystackql
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from pystackql.core.output import OutputFormatter
+
+
+class TestMarkdownKVFormat:
+    """Tests for Markdown-KV output formatting.
+
+    Every test uses a fresh ``OutputFormatter(output_format='markdownkv')``
+    created in ``setup_method`` and inspects the string it produces.
+    """
+
+    def setup_method(self):
+        """Set up test fixtures."""
+        self.formatter = OutputFormatter(output_format='markdownkv')
+
+    def test_format_initialization(self):
+        """Test that markdownkv is accepted as a valid output format."""
+        assert self.formatter.output_format == 'markdownkv'
+
+    def test_invalid_format_rejected(self):
+        """Test that invalid formats are rejected."""
+        with pytest.raises(ValueError) as exc_info:
+            OutputFormatter(output_format='invalid')
+        # Checked after the ``with`` body: a statement placed after the
+        # raising call inside the body would never execute.
+        assert "Invalid output format" in str(exc_info.value)
+
+    def test_format_simple_data(self):
+        """Test formatting simple data as Markdown-KV."""
+        data = [
+            {"id": 1, "name": "Alice", "age": 30},
+            {"id": 2, "name": "Bob", "age": 25},
+        ]
+        json_data = json.dumps(data)
+
+        result = self.formatter._format_data(json_data)
+
+        # Check structure
+        assert isinstance(result, str)
+        assert "# Query Results" in result
+        assert "## Record 1" in result
+        assert "## Record 2" in result
+        assert "id: 1" in result
+        assert "name: Alice" in result
+        assert "age: 30" in result
+        assert "id: 2" in result
+        assert "name: Bob" in result
+
+    def test_format_with_null_values(self):
+        """Test formatting data with null values."""
+        data = [
+            {"id": 1, "name": "Alice", "city": None},
+        ]
+        json_data = json.dumps(data)
+
+        result = self.formatter._format_data(json_data)
+
+        assert "city: null" in result
+
+    def test_format_empty_data(self):
+        """Test formatting empty data."""
+        result = self.formatter._format_empty()
+
+        assert isinstance(result, str)
+        assert "# Query Results" in result
+        assert "No records found" in result
+
+    def test_format_error(self):
+        """Test formatting error messages."""
+        error_msg = "http response status code: 404"
+
+        result = self.formatter._format_markdownkv_error(error_msg)
+
+        assert isinstance(result, str)
+        assert "# Query Results" in result
+        assert "## Error" in result
+        assert "error: http response status code: 404" in result
+
+    def test_format_statement_result(self):
+        """Test formatting statement results."""
+        # The statement text is supplied under the "error" key; the formatted
+        # output is expected to present it under a "message" key instead.
+        result = {
+            "error": "okta provider, version 'v0.5.0' successfully installed"
+        }
+
+        formatted = self.formatter.format_statement_result(result)
+
+        assert isinstance(formatted, str)
+        assert "# Statement Result" in formatted
+        assert "message: okta provider" in formatted
+
+    def test_format_with_code_blocks(self):
+        """Test that code blocks are properly formatted."""
+        data = [{"id": 1, "name": "Test"}]
+        json_data = json.dumps(data)
+
+        result = self.formatter._format_data(json_data)
+
+        # Count code block markers
+        assert result.count("```") >= 2  # At least opening and closing
+
+    def test_llm_friendly_structure(self):
+        """Test that the output follows LLM-friendly Markdown-KV structure."""
+        data = [
+            {"employee_id": 1, "department": "Engineering", "salary": 100000},
+        ]
+        json_data = json.dumps(data)
+
+        result = self.formatter._format_data(json_data)
+
+        # Verify hierarchical structure
+        lines = result.split('\n')
+
+        # Should have main header
+        assert any('# Query Results' in line for line in lines)
+
+        # Should have record header
+        assert any('## Record' in line for line in lines)
+
+        # Should have code block with key: value pairs
+        assert 'employee_id: 1' in result
+        assert 'department: Engineering' in result
+        assert 'salary: 100000' in result
+
+    def test_multiple_records_formatting(self):
+        """Test formatting multiple records maintains structure."""
+        data = [
+            {"id": i, "value": f"test{i}"}
+            for i in range(1, 6)
+        ]
+        json_data = json.dumps(data)
+
+        result = self.formatter._format_data(json_data)
+
+        # Should have 5 record sections
+        for i in range(1, 6):
+            assert f"## Record {i}" in result
+            assert f"id: {i}" in result
+            assert f"value: test{i}" in result
+
+    def test_complex_data_types(self):
+        """Test handling of various data types."""
+        data = [{
+            "string": "test",
+            "number": 42,
+            "float": 3.14,
+            "boolean": True,
+            "null": None,
+            "empty_string": "",
+        }]
+        json_data = json.dumps(data)
+
+        result = self.formatter._format_data(json_data)
+
+        assert "string: test" in result
+        assert "number: 42" in result
+        assert "float: 3.14" in result
+        # Case-insensitive on purpose: accepts both "True" and "true".
+        assert "boolean: true" in result.lower()
+        assert "null: null" in result
+        assert "empty_string:" in result
+
+    def test_error_detection_integration(self):
+        """Test that error detection works with markdownkv format."""
+        # HTTP error should be detected
+        error_data = "http response status code: 404, response body: Not Found"
+
+        result = self.formatter._format_data(error_data)
+
+        assert "# Query Results" in result
+        assert "## Error" in result
+        assert "404" in result
+
+
+class TestMarkdownKVServerModeCompatibility:
+    """Tests for markdownkv in server mode scenarios."""
+
+    def test_server_mode_formatting(self):
+        """Test that markdownkv can format server mode results."""
+        formatter = OutputFormatter(output_format='markdownkv')
+
+        # Simulate server mode result (list of dicts from database)
+        data = [
+            {"formula_name": "python", "version": "3.9.0", "license": "Python-2.0"},
+        ]
+
+        result = formatter._format_markdownkv(data)
+
+        assert "# Query Results" in result
+        assert "formula_name: python" in result
+        assert "version: 3.9.0" in result
+
+
+if __name__ == "__main__":
+    # Propagate pytest's exit status so a direct run fails when tests fail.
+    sys.exit(pytest.main(["-v", __file__]))