diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml index dbc9420..4fc7ba6 100644 --- a/.github/workflows/pull_request.yaml +++ b/.github/workflows/pull_request.yaml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.13", "3.14"] + python-version: ["3.13"] fail-fast: false steps: diff --git a/src/hother/streamblocks/core/block_state_machine.py b/src/hother/streamblocks/core/block_state_machine.py index e6413a2..1c208e6 100644 --- a/src/hother/streamblocks/core/block_state_machine.py +++ b/src/hother/streamblocks/core/block_state_machine.py @@ -10,6 +10,8 @@ from hother.streamblocks.core.models import BlockCandidate, ExtractedBlock from hother.streamblocks.core.registry import MetadataValidationFailureMode from hother.streamblocks.core.types import ( + BaseContent, + BaseMetadata, BlockContentDeltaEvent, BlockContentEndEvent, BlockEndEvent, @@ -21,6 +23,8 @@ BlockStartEvent, BlockState, Event, + ParseResult, + SectionType, TextContentEvent, ) from hother.streamblocks.core.utils import get_syntax_name @@ -201,7 +205,11 @@ def _handle_boundary(self, candidate: BlockCandidate, line: str, line_number: in events.append(self._create_section_delta_event(candidate, line, line_number, is_boundary=True)) # Check if metadata section just ended (transition to content) - if self._emit_section_end_events and old_section == "metadata" and candidate.current_section == "content": + if ( + self._emit_section_end_events + and old_section == SectionType.METADATA + and candidate.current_section == SectionType.CONTENT + ): metadata_end_event = self._create_metadata_end_event(candidate, line_number) events.append(metadata_end_event) @@ -337,64 +345,45 @@ def reset(self) -> None: self._candidates.clear() self._block_ids.clear() - def _try_extract_block( - self, - candidate: BlockCandidate, - line_number: int, - ) -> BlockEndEvent | BlockErrorEvent: - """Try to parse and validate a complete block. + def _parse_candidate(self, candidate: BlockCandidate) -> tuple[str | None, ParseResult[BaseMetadata, BaseContent]]: + """Extract block type, look up class, and parse the candidate. Args: - candidate: Block candidate to extract - line_number: Current line number (end of block) + candidate: Block candidate to parse Returns: - BlockEndEvent if successful, BlockErrorEvent if validation fails + Tuple of (block_type, parse_result) """ - # Step 1: Extract block_type from candidate + # Extract block_type from candidate block_type = candidate.syntax.extract_block_type(candidate) - self._logger.debug( - "extracting_block", - syntax=get_syntax_name(candidate.syntax), - block_type=block_type, - start_line=candidate.start_line, - end_line=line_number, - size_bytes=len(candidate.raw_text), - ) - - # Step 2: Look up block_class from registry - block_class = None - if block_type: - block_class = self._registry.get_block_class(block_type) + # Look up block_class from registry + block_class = self._registry.get_block_class(block_type) if block_type else None - # Step 3: Parse with the appropriate block_class + # Parse with the appropriate block_class parse_result = candidate.syntax.parse_block(candidate, block_class) - if not parse_result.success: - error = parse_result.error or "Parse failed" - self._logger.warning( - "block_parse_failed", - block_type=block_type, - error=error, - syntax=get_syntax_name(candidate.syntax), - exc_info=parse_result.exception, - ) - return self._create_error_event( - candidate, line_number, error, BlockErrorCode.PARSE_FAILED, parse_result.exception - ) + return block_type, parse_result - metadata = parse_result.metadata - content = parse_result.content + def _create_extracted_block( + self, + candidate: BlockCandidate, + parse_result: ParseResult[BaseMetadata, BaseContent], + line_number: int, + ) -> ExtractedBlock[BaseMetadata, BaseContent]: + """Create ExtractedBlock from parse result. - if metadata is None or content is None: - error_code = BlockErrorCode.MISSING_METADATA if metadata is None else BlockErrorCode.MISSING_CONTENT - return self._create_error_event(candidate, line_number, "Missing metadata or content", error_code) + Args: + candidate: Block candidate + parse_result: Successful parse result + line_number: Current line number (end of block) - # Create extracted block with metadata, content, and extraction info - block = ExtractedBlock( - metadata=metadata, - content=content, + Returns: + ExtractedBlock with metadata, content, and extraction info + """ + return ExtractedBlock( + metadata=parse_result.metadata, # type: ignore[arg-type] # Checked by caller + content=parse_result.content, # type: ignore[arg-type] # Checked by caller syntax_name=get_syntax_name(candidate.syntax), raw_text=candidate.raw_text, line_start=candidate.start_line, @@ -402,7 +391,25 @@ def _try_extract_block( hash_id=candidate.compute_hash(), ) - # Additional validation from syntax + def _validate_extracted_block( + self, + candidate: BlockCandidate, + block: ExtractedBlock[BaseMetadata, BaseContent], + block_type: str | None, + line_number: int, + ) -> BlockErrorEvent | None: + """Run syntax and registry validation on extracted block. + + Args: + candidate: Block candidate + block: Extracted block to validate + block_type: Block type from candidate + line_number: Current line number + + Returns: + BlockErrorEvent if validation fails, None if passes + """ + # Syntax validation if not candidate.syntax.validate_block(block): self._logger.warning( "syntax_validation_failed", @@ -424,6 +431,24 @@ def _try_extract_block( candidate, line_number, "Registry validation failed", BlockErrorCode.VALIDATION_FAILED ) + return None + + def _create_success_event( + self, + candidate: BlockCandidate, + block: ExtractedBlock[BaseMetadata, BaseContent], + block_type: str | None, + ) -> BlockEndEvent: + """Create BlockEndEvent for successful extraction. + + Args: + candidate: Block candidate + block: Successfully extracted and validated block + block_type: Block type from candidate + + Returns: + BlockEndEvent with all block information + """ self._logger.info( "block_extracted", block_type=block_type, @@ -437,12 +462,12 @@ def _try_extract_block( event = BlockEndEvent( block_id=block_id, - block_type=block_type or metadata.block_type, + block_type=block_type or block.metadata.block_type, syntax=block.syntax_name, start_line=block.line_start, end_line=block.line_end, - metadata=metadata.model_dump(), - content=content.model_dump(), + metadata=block.metadata.model_dump(), + content=block.content.model_dump(), raw_content=block.raw_text, hash_id=block.hash_id, ) @@ -450,6 +475,66 @@ def _try_extract_block( object.__setattr__(event, "_block", block) return event + def _try_extract_block( + self, + candidate: BlockCandidate, + line_number: int, + ) -> BlockEndEvent | BlockErrorEvent: + """Try to parse and validate a complete block. + + This orchestrates the extraction process by delegating to specialized helpers. + + Args: + candidate: Block candidate to extract + line_number: Current line number (end of block) + + Returns: + BlockEndEvent if successful, BlockErrorEvent if validation fails + """ + # Step 1: Parse the candidate + block_type, parse_result = self._parse_candidate(candidate) + + self._logger.debug( + "extracting_block", + syntax=get_syntax_name(candidate.syntax), + block_type=block_type, + start_line=candidate.start_line, + end_line=line_number, + size_bytes=len(candidate.raw_text), + ) + + # Handle parse failure + if not parse_result.success: + error = parse_result.error or "Parse failed" + self._logger.warning( + "block_parse_failed", + block_type=block_type, + error=error, + syntax=get_syntax_name(candidate.syntax), + exc_info=parse_result.exception, + ) + return self._create_error_event( + candidate, line_number, error, BlockErrorCode.PARSE_FAILED, parse_result.exception + ) + + # Check for missing metadata or content + if parse_result.metadata is None or parse_result.content is None: + error_code = ( + BlockErrorCode.MISSING_METADATA if parse_result.metadata is None else BlockErrorCode.MISSING_CONTENT + ) + return self._create_error_event(candidate, line_number, "Missing metadata or content", error_code) + + # Step 2: Create extracted block + block = self._create_extracted_block(candidate, parse_result, line_number) + + # Step 3: Validate + validation_error = self._validate_extracted_block(candidate, block, block_type, line_number) + if validation_error: + return validation_error + + # Step 4: Success! + return self._create_success_event(candidate, block, block_type) + def _create_section_delta_event( self, candidate: BlockCandidate, @@ -470,11 +555,11 @@ def _create_section_delta_event( Section-specific delta event """ block_id = self.get_block_id(candidate.start_line) - section = candidate.current_section or "content" + section = candidate.current_section or SectionType.CONTENT syntax_name = get_syntax_name(candidate.syntax) accumulated_size = len(candidate.raw_text) - if section == "header": + if section == SectionType.HEADER: return BlockHeaderDeltaEvent( block_id=block_id, delta=line, @@ -483,7 +568,7 @@ def _create_section_delta_event( accumulated_size=accumulated_size, inline_metadata=None, ) - if section == "metadata": + if section == SectionType.METADATA: return BlockMetadataDeltaEvent( block_id=block_id, delta=line, @@ -580,9 +665,8 @@ def _create_metadata_end_event( validation_passed = result.passed validation_error = result.error - # Store validation state in candidate - candidate.metadata_validation_passed = validation_passed - candidate.metadata_validation_error = validation_error + # Cache validation state in candidate + candidate.cache_metadata_validation(validation_passed, validation_error) return BlockMetadataEndEvent( block_id=block_id, @@ -638,9 +722,8 @@ def _create_content_end_event( validation_passed = result.passed validation_error = result.error - # Store validation state in candidate - candidate.content_validation_passed = validation_passed - candidate.content_validation_error = validation_error + # Cache validation state in candidate + candidate.cache_content_validation(validation_passed, validation_error) return BlockContentEndEvent( block_id=block_id, diff --git a/src/hother/streamblocks/core/constants.py b/src/hother/streamblocks/core/constants.py new file mode 100644 index 0000000..b10946b --- /dev/null +++ b/src/hother/streamblocks/core/constants.py @@ -0,0 +1,47 @@ +"""Core constants and limits for StreamBlocks. + +This module centralizes all numeric constants and default values used throughout +the library, making them easy to discover and modify if needed. +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ProcessingLimits: + """Default limits for stream processing. + + These constants define the default safety limits for processing streams + and extracting blocks. They are used as defaults in ProcessorConfig. + + Attributes: + MAX_BLOCK_SIZE: Maximum block size in bytes (1 MiB). + Blocks exceeding this limit are rejected to prevent memory exhaustion. + MAX_LINE_LENGTH: Maximum line length in bytes (16 KiB). + Lines exceeding this limit are truncated to prevent memory issues. + HASH_PREFIX_LENGTH: Number of characters used for hash computation (64). + Used when generating block IDs from content hash. + LINES_BUFFER: Default number of recent lines to keep in buffer (5). + Used for debugging and error context. + """ + + #: Maximum block size in bytes (1 MiB) + MAX_BLOCK_SIZE: int = 1_048_576 + + #: Maximum line length in bytes (16 KiB) + MAX_LINE_LENGTH: int = 16_384 + + #: Number of characters used for hash computation + HASH_PREFIX_LENGTH: int = 64 + + #: Default number of recent lines to keep in buffer + LINES_BUFFER: int = 5 + + +# Singleton instance for convenient access +LIMITS = ProcessingLimits() + +# Type system constants +EXPECTED_BLOCK_TYPE_PARAMS = 2 # For Block[TMetadata, TContent] diff --git a/src/hother/streamblocks/core/models.py b/src/hother/streamblocks/core/models.py index 0a9422c..3d832fe 100644 --- a/src/hother/streamblocks/core/models.py +++ b/src/hother/streamblocks/core/models.py @@ -7,14 +7,12 @@ from pydantic import BaseModel, Field -from hother.streamblocks.core.types import BaseContent, BaseMetadata, BlockState +from hother.streamblocks.core.constants import EXPECTED_BLOCK_TYPE_PARAMS, LIMITS +from hother.streamblocks.core.types import BaseContent, BaseMetadata, BlockState, SectionType if TYPE_CHECKING: from hother.streamblocks.syntaxes.base import BaseSyntax -# Expected number of type parameters for Block[TMetadata, TContent] -_EXPECTED_BLOCK_TYPE_PARAMS = 2 - def extract_block_types(block_class: type[Any]) -> tuple[type[BaseMetadata], type[BaseContent]]: """Extract metadata and content type parameters from a Block class. @@ -33,7 +31,7 @@ def extract_block_types(block_class: type[Any]) -> tuple[type[BaseMetadata], typ origin = get_origin(base) if origin is not None and origin.__name__ == "Block": args = get_args(base) - if len(args) == _EXPECTED_BLOCK_TYPE_PARAMS: + if len(args) == EXPECTED_BLOCK_TYPE_PARAMS: return args # Fallback: extract from Pydantic field annotations (for non-generic subclasses) @@ -85,7 +83,7 @@ def __init__(self, syntax: BaseSyntax, start_line: int) -> None: self.state = BlockState.HEADER_DETECTED self.metadata_lines: list[str] = [] self.content_lines: list[str] = [] - self.current_section: str = "header" # "header", "metadata", "content" + self.current_section: SectionType = SectionType.HEADER # Cache fields for early parsing results self.parsed_metadata: dict[str, Any] | None = None @@ -101,14 +99,56 @@ def add_line(self, line: str) -> None: """Add a line to the candidate.""" self.lines.append(line) + def transition_to_metadata(self) -> None: + """Transition from header to metadata section. + + This method encapsulates the section state transition logic, + making the state change explicit and centralized. + """ + self.current_section = SectionType.METADATA + + def transition_to_content(self) -> None: + """Transition from metadata/header to content section. + + This method encapsulates the section state transition logic, + making the state change explicit and centralized. + """ + self.current_section = SectionType.CONTENT + + def cache_metadata_validation(self, passed: bool, error: str | None) -> None: + """Cache metadata validation result. + + This method encapsulates validation result storage, providing + a clear interface for the state machine to cache validation state. + + Args: + passed: Whether metadata validation passed + error: Error message if validation failed, None otherwise + """ + self.metadata_validation_passed = passed + self.metadata_validation_error = error + + def cache_content_validation(self, passed: bool, error: str | None) -> None: + """Cache content validation result. + + This method encapsulates validation result storage, providing + a clear interface for the state machine to cache validation state. + + Args: + passed: Whether content validation passed + error: Error message if validation failed, None otherwise + """ + self.content_validation_passed = passed + self.content_validation_error = error + @property def raw_text(self) -> str: """Get the raw text of all accumulated lines.""" return "\n".join(self.lines) def compute_hash(self) -> str: - """Compute hash of first 64 chars for ID.""" - text_slice = self.raw_text[:64] + """Compute hash of first N chars for ID (N defined in constants).""" + text_slice = self.raw_text[: LIMITS.HASH_PREFIX_LENGTH] return hashlib.sha256(text_slice.encode()).hexdigest()[:8] def __repr__(self) -> str: diff --git a/src/hother/streamblocks/core/processor.py b/src/hother/streamblocks/core/processor.py index 1635cf6..c99839c 100644 --- a/src/hother/streamblocks/core/processor.py +++ b/src/hother/streamblocks/core/processor.py @@ -13,6 +13,7 @@ from hother.streamblocks.adapters.protocols import HasNativeModulePrefix from hother.streamblocks.core._logger import StdlibLoggerAdapter from hother.streamblocks.core.block_state_machine import BlockStateMachine +from hother.streamblocks.core.constants import LIMITS from hother.streamblocks.core.line_accumulator import LineAccumulator from hother.streamblocks.core.types import ( BlockContentDeltaEvent, @@ -49,11 +50,44 @@ def duration_ms(self) -> int: @dataclass(frozen=True, slots=True) class ProcessorConfig: - """Configuration for StreamBlockProcessor.""" + """Configuration for StreamBlockProcessor. + + Attributes: + lines_buffer: Number of recent lines to keep in buffer for context (default: 5). + Used for debugging and error messages. + max_line_length: Maximum line length in bytes before truncation (default: 16,384). + Lines exceeding this limit are truncated to prevent memory issues. + max_block_size: Maximum block size in bytes before rejection (default: 1,048,576 = 1MB). + Blocks exceeding this limit are rejected with SIZE_EXCEEDED error. + emit_original_events: Whether to pass through original provider events (default: True). + When False, only StreamBlocks events are emitted. Set to False when using + IdentityInputAdapter to avoid duplicate events. + emit_text_deltas: Whether to emit TextDeltaEvent for real-time streaming (default: True). + Enables character-level streaming for live UIs. Disable to reduce event volume. + emit_section_end_events: Whether to emit section end events (default: True). + Controls BlockMetadataEndEvent and BlockContentEndEvent emission for early validation. + auto_detect_adapter: Whether to auto-detect input adapter from first chunk (default: True). + When False, uses IdentityInputAdapter. Disable for performance with known adapter. + + Example: + >>> # Custom configuration for large blocks + >>> config = ProcessorConfig( + ... max_block_size=2_097_152, # 2MB + ... emit_original_events=False, + ... emit_text_deltas=False, + ... ) + >>> processor = StreamBlockProcessor(registry, config=config) + >>> + >>> # Minimal configuration for performance + >>> config = ProcessorConfig( + ... emit_section_end_events=False, + ... auto_detect_adapter=False, + ... ) + """ - lines_buffer: int = 5 - max_line_length: int = 16_384 - max_block_size: int = 1_048_576 # 1MB + lines_buffer: int = LIMITS.LINES_BUFFER + max_line_length: int = LIMITS.MAX_LINE_LENGTH + max_block_size: int = LIMITS.MAX_BLOCK_SIZE emit_original_events: bool = True emit_text_deltas: bool = True emit_section_end_events: bool = True @@ -142,6 +176,10 @@ def process_chunk( List of events generated from this chunk. May be empty if chunk only accumulates text without completing any lines. + Raises: + RuntimeError: If adapter is not set after first chunk processing + (internal state error, should not occur in normal usage). + Example: >>> processor = StreamBlockProcessor(registry) >>> response = await client.generate_content_stream(...) @@ -316,6 +354,10 @@ async def process_stream( - TextDeltaEvent (if emit_text_deltas=True) - TextContentEvent, BlockStartEvent, BlockEndEvent, BlockErrorEvent, and section delta events + Raises: + RuntimeError: If adapter is not set after first chunk processing + (internal state error, should not occur in normal usage). + Example: >>> # Plain text >>> async for event in processor.process_stream(text_stream): diff --git a/src/hother/streamblocks/core/types.py b/src/hother/streamblocks/core/types.py index 6b0ae61..18398dd 100644 --- a/src/hother/streamblocks/core/types.py +++ b/src/hother/streamblocks/core/types.py @@ -19,7 +19,26 @@ class BaseMetadata(BaseModel): """Base metadata model with standard fields. - All custom metadata models should inherit from this class. + All custom metadata models should inherit from this class and add + their domain-specific fields. + + Example: + >>> # Define custom metadata for a patch block + >>> class PatchMetadata(BaseMetadata): + ... file_path: str + ... operation: Literal["create", "update", "delete"] + ... + >>> # Create instance with required base fields + >>> metadata = PatchMetadata( + ... id="patch_001", + ... block_type="patch", + ... file_path="src/main.py", + ... operation="update" + ... ) + >>> metadata.id + 'patch_001' + >>> metadata.file_path + 'src/main.py' """ id: str = Field(..., description="Block identifier") @@ -29,8 +48,33 @@ class BaseMetadata(BaseModel): class BaseContent(BaseModel): """Base content model with raw content field. - All custom content models should inherit from this class. - The raw_content field always contains the unparsed block content. + All custom content models should inherit from this class and optionally + override the parse() method to add custom parsing logic. The raw_content + field always preserves the original unparsed text. + + Example: + >>> # Simple content model that just stores raw text + >>> class SimpleContent(BaseContent): + ... pass + ... + >>> content = SimpleContent.parse("Hello, world!") + >>> content.raw_content + 'Hello, world!' + >>> + >>> # Content model with custom parsing + >>> class ItemsContent(BaseContent): + ... items: list[str] = [] + ... + ... @classmethod + ... def parse(cls, raw_text: str) -> Self: + ... items = [line.strip() for line in raw_text.split("\\n") if line.strip()] + ... return cls(raw_content=raw_text, items=items) + ... + >>> content = ItemsContent.parse("apple\\nbanana\\norange") + >>> content.items + ['apple', 'banana', 'orange'] + >>> content.raw_content # Original text preserved + 'apple\\nbanana\\norange' """ raw_content: str = Field(..., description="Raw unparsed content from the block") @@ -86,7 +130,40 @@ class BlockState(StrEnum): class BlockErrorCode(StrEnum): - """Standard error codes for BlockErrorEvent.""" + """Standard error codes for BlockErrorEvent. + + These codes categorize why a block extraction failed, enabling + appropriate error handling and recovery strategies. + + Values: + VALIDATION_FAILED: Block failed validator function checks (syntax or registry validation). + Indicates the block structure is valid but business rules were violated. + SIZE_EXCEEDED: Block exceeded max_block_size limit. + Prevents memory exhaustion from maliciously large blocks. + UNCLOSED_BLOCK: Block opened but never closed (stream ended). + Indicates incomplete block at end of stream. + UNKNOWN_TYPE: block_type not registered in registry. + The syntax extracted a type that hasn't been registered. + PARSE_FAILED: Block parsing failed (malformed YAML, invalid structure). + The syntax couldn't parse the block content into metadata/content. + MISSING_METADATA: Required metadata section missing. + Parse succeeded but returned None for metadata. + MISSING_CONTENT: Required content section missing. + Parse succeeded but returned None for content. + SYNTAX_ERROR: Syntax-specific error (custom validation). + Used for syntax-specific validation failures. + + Example: + >>> # Handle different error codes appropriately + >>> async for event in processor.process_stream(stream): + ... if isinstance(event, BlockErrorEvent): + ... if event.error_code == BlockErrorCode.SIZE_EXCEEDED: + ... logger.warning(f"Block too large: {event.reason}") + ... elif event.error_code == BlockErrorCode.VALIDATION_FAILED: + ... logger.error(f"Validation failed: {event.reason}") + ... elif event.error_code == BlockErrorCode.UNCLOSED_BLOCK: + ... logger.info(f"Incomplete block at stream end: {event.reason}") + """ VALIDATION_FAILED = "VALIDATION_FAILED" SIZE_EXCEEDED = "SIZE_EXCEEDED" @@ -98,6 +175,18 @@ class BlockErrorCode(StrEnum): SYNTAX_ERROR = "SYNTAX_ERROR" +class SectionType(StrEnum): + """Block section types during accumulation. + + These represent the different phases of block parsing as content + is accumulated line by line. + """ + + HEADER = "header" # Block opening line(s) with inline metadata + METADATA = "metadata" # Dedicated metadata section (e.g., YAML frontmatter) + CONTENT = "content" # Main block content + + # ============== Base Event ============== diff --git a/src/hother/streamblocks/syntaxes/base.py b/src/hother/streamblocks/syntaxes/base.py index 887406c..c4dce92 100644 --- a/src/hother/streamblocks/syntaxes/base.py +++ b/src/hother/streamblocks/syntaxes/base.py @@ -9,13 +9,12 @@ import yaml from pydantic import ValidationError +# Import BaseMetadata at module level for runtime isinstance checks +from hother.streamblocks.core.types import BaseMetadata, ParseResult + if TYPE_CHECKING: from hother.streamblocks.core.models import BlockCandidate, ExtractedBlock - from hother.streamblocks.core.types import BaseContent, BaseMetadata, DetectionResult - -# Import ParseResult at module level to avoid circular import issues -# This is safe because it's only used for return types -from hother.streamblocks.core.types import ParseResult + from hother.streamblocks.core.types import BaseContent, DetectionResult # Module-level logger for debugging YAML parsing failures _logger = logging.getLogger(__name__) @@ -175,7 +174,34 @@ def parse_block( """ ... - # Helper methods for validation + # Helper methods for metadata and validation + + def _set_default_metadata_fields( + self, + metadata_dict: dict[str, Any], + candidate: BlockCandidate, + metadata_class: type[BaseMetadata], + default_type: str = "unknown", + ) -> None: + """Set default id and block_type if using BaseMetadata. + + This helper method consolidates the common pattern of setting default + values for id and block_type when using the base metadata class. + Custom metadata classes should provide their own defaults. + + Args: + metadata_dict: Metadata dictionary to update with defaults + candidate: The block candidate (used for hash-based ID generation) + metadata_class: The metadata class being used + default_type: Default block_type value if not present (default: "unknown") + """ + if metadata_class is not BaseMetadata: + return + + if "id" not in metadata_dict: + metadata_dict["id"] = f"block_{candidate.compute_hash()}" + if "block_type" not in metadata_dict: + metadata_dict["block_type"] = default_type def _safe_parse_metadata( self, diff --git a/src/hother/streamblocks/syntaxes/delimiter.py b/src/hother/streamblocks/syntaxes/delimiter.py index 7ea15b5..324957d 100644 --- a/src/hother/streamblocks/syntaxes/delimiter.py +++ b/src/hother/streamblocks/syntaxes/delimiter.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable from hother.streamblocks.core.models import extract_block_types -from hother.streamblocks.core.types import BaseContent, BaseMetadata, DetectionResult, ParseResult +from hother.streamblocks.core.types import BaseContent, BaseMetadata, DetectionResult, ParseResult, SectionType from hother.streamblocks.syntaxes.base import BaseSyntax, YAMLFrontmatterMixin if TYPE_CHECKING: @@ -29,7 +29,45 @@ def parse(cls, raw_text: str) -> BaseContent: class DelimiterPreambleSyntax(BaseSyntax): """Syntax: !! delimiter with inline metadata. - Format: !!:[:param1:param2...] + This syntax uses delimiter markers with inline metadata in the opening line. + Metadata is extracted from the delimiter preamble, and all lines between + opening and closing delimiters become the content. + + Format: + !!:[:param1:param2:...] + Content lines here + !!end + + The opening delimiter must include: + - Block ID (alphanumeric, required) + - Block type (alphanumeric, required) + - Additional parameters (optional, colon-separated) + + Additional parameters are stored as param_0, param_1, etc. in metadata. + + Examples: + >>> # Simple block with just ID and type + >>> ''' + ... !!patch001:patch + ... Fix the login bug + ... !!end + ... ''' + >>> + >>> # Block with parameters + >>> ''' + ... !!file123:operation:create:urgent + ... Create new config file + ... !!end + ... ''' + >>> # Metadata will be: { + >>> # "id": "file123", + >>> # "block_type": "operation", + >>> # "param_0": "create", + >>> # "param_1": "urgent" + >>> # } + + Args: + delimiter: Opening delimiter string (default: "!!") """ def __init__( @@ -163,13 +201,54 @@ def parse_content_early(self, candidate: BlockCandidate) -> dict[str, Any] | Non class DelimiterFrontmatterSyntax(BaseSyntax, YAMLFrontmatterMixin): """Syntax: Delimiter markers with YAML frontmatter. + This syntax uses simple delimiter markers with YAML frontmatter for metadata. + The frontmatter section is delimited by --- markers and must be valid YAML. + Format: - !!start - --- - key: value - --- - content - !!end + !!start + --- + id: block_001 + block_type: example + custom_field: value + --- + Content lines here + !!end + + The YAML frontmatter should include: + - id: Block identifier (required if using BaseMetadata) + - block_type: Block type (required if using BaseMetadata) + - Any additional custom fields defined in your metadata class + + Examples: + >>> # Simple block with minimal metadata + >>> ''' + ... !!start + ... --- + ... id: msg001 + ... block_type: message + ... --- + ... Hello, world! + ... !!end + ... ''' + >>> + >>> # Block with nested YAML metadata + >>> ''' + ... !!start + ... --- + ... id: task001 + ... block_type: task + ... priority: high + ... tags: + ... - urgent + ... - backend + ... --- + ... Implement user authentication + ... !!end + ... ''' + + Args: + start_delimiter: Opening delimiter string (default: "!!start") + end_delimiter: Closing delimiter string (default: "!!end") """ def __init__( @@ -194,23 +273,23 @@ def detect_line(self, line: str, candidate: BlockCandidate | None = None) -> Det if line.strip() == self.start_delimiter: return DetectionResult(is_opening=True) # Inside a block - elif candidate.current_section == "header": + elif candidate.current_section == SectionType.HEADER: # Should be frontmatter start if self._frontmatter_pattern.match(line): - candidate.current_section = "metadata" + candidate.transition_to_metadata() return DetectionResult(is_metadata_boundary=True) # Skip empty lines in header - frontmatter might follow if line.strip() == "": return DetectionResult() # Move directly to content if no frontmatter - candidate.current_section = "content" + candidate.transition_to_content() candidate.content_lines.append(line) - elif candidate.current_section == "metadata": + elif candidate.current_section == SectionType.METADATA: if self._frontmatter_pattern.match(line): - candidate.current_section = "content" + candidate.transition_to_content() return DetectionResult(is_metadata_boundary=True) candidate.metadata_lines.append(line) - elif candidate.current_section == "content": + elif candidate.current_section == SectionType.CONTENT: if line.strip() == self.end_delimiter: return DetectionResult(is_closing=True) candidate.content_lines.append(line) @@ -219,7 +298,7 @@ def detect_line(self, line: str, candidate: BlockCandidate | None = None) -> Det def should_accumulate_metadata(self, candidate: BlockCandidate) -> bool: """Check if we're still in metadata section.""" - return candidate.current_section in ["header", "metadata"] + return candidate.current_section in {SectionType.HEADER, SectionType.METADATA} def extract_block_type(self, candidate: BlockCandidate) -> str | None: """Extract block_type from YAML frontmatter.""" @@ -247,14 +326,8 @@ def parse_block( if yaml_error: return ParseResult(success=False, error=f"YAML parse error: {yaml_error}", exception=yaml_error) - # Ensure id and block_type have defaults - # Only fill in defaults if using BaseMetadata (no custom class provided) - if metadata_class is BaseMetadata: - if "id" not in metadata_dict: - # Generate an ID based on hash of content - metadata_dict["id"] = f"block_{candidate.compute_hash()}" - if "block_type" not in metadata_dict: - metadata_dict["block_type"] = "unknown" + # Set default id and block_type if using BaseMetadata + self._set_default_metadata_fields(metadata_dict, candidate, metadata_class, default_type="unknown") # Parse metadata using helper metadata = self._safe_parse_metadata(metadata_class, metadata_dict) diff --git a/src/hother/streamblocks/syntaxes/markdown.py b/src/hother/streamblocks/syntaxes/markdown.py index 047d0c5..3c63232 100644 --- a/src/hother/streamblocks/syntaxes/markdown.py +++ b/src/hother/streamblocks/syntaxes/markdown.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any from hother.streamblocks.core.models import extract_block_types -from hother.streamblocks.core.types import BaseContent, BaseMetadata, DetectionResult, ParseResult +from hother.streamblocks.core.types import BaseContent, BaseMetadata, DetectionResult, ParseResult, SectionType from hother.streamblocks.syntaxes.base import BaseSyntax, YAMLFrontmatterMixin if TYPE_CHECKING: @@ -14,15 +14,67 @@ class MarkdownFrontmatterSyntax(BaseSyntax, YAMLFrontmatterMixin): - """Syntax: Markdown-style with YAML frontmatter. + """Syntax: Markdown fenced code blocks with YAML frontmatter. + + This syntax uses Markdown-style fenced code blocks with optional YAML frontmatter + for metadata. The info_string after the opening fence can be used as a fallback + block_type when no frontmatter is present. Format: - ```[info] - --- - key: value - --- - content - ``` + ```[info_string] + --- + id: block_001 + block_type: example + custom_field: value + --- + Content lines here + ``` + + The info_string is optional. When provided, it's used as the block_type if + no YAML frontmatter is present. The YAML frontmatter is also optional - if + omitted, all content becomes the block content. + + Examples: + >>> # Block with frontmatter + >>> ''' + ... ```python + ... --- + ... id: code001 + ... block_type: code + ... language: python + ... --- + ... def hello(): + ... print("Hello, world!") + ... ``` + ... ''' + >>> + >>> # Block without frontmatter (info_string becomes block_type) + >>> ''' + ... ```patch + ... diff --git a/file.py b/file.py + ... - old line + ... + new line + ... ``` + ... ''' + >>> # block_type will be "patch" from info_string + >>> + >>> # Block with nested YAML + >>> ''' + ... ```task + ... --- + ... id: task001 + ... block_type: task + ... assignees: + ... - alice + ... - bob + ... --- + ... Implement user authentication + ... ``` + ... ''' + + Args: + fence: Fence string (default: "```") + info_string: Optional info string used as fallback block_type """ def __init__( @@ -55,24 +107,24 @@ def detect_line(self, line: str, candidate: BlockCandidate | None = None) -> Det if self._fence_pattern.match(line): return DetectionResult(is_opening=True) # Inside a block - elif candidate.current_section == "header": + elif candidate.current_section == SectionType.HEADER: # Check if this is frontmatter start if self._frontmatter_pattern.match(line): - candidate.current_section = "metadata" + candidate.transition_to_metadata() return DetectionResult(is_metadata_boundary=True) # Skip empty lines in header - frontmatter might follow if line.strip() == "": return DetectionResult() # Non-empty, non-frontmatter line - move to content - candidate.current_section = "content" + candidate.transition_to_content() candidate.content_lines.append(line) - elif candidate.current_section == "metadata": + elif candidate.current_section == SectionType.METADATA: # Check for metadata end if self._frontmatter_pattern.match(line): - candidate.current_section = "content" + candidate.transition_to_content() return DetectionResult(is_metadata_boundary=True) candidate.metadata_lines.append(line) - elif candidate.current_section == "content": + elif candidate.current_section == SectionType.CONTENT: # Check for closing fence if line.strip() == self.fence: return DetectionResult(is_closing=True) @@ -82,7 +134,7 @@ def detect_line(self, line: str, candidate: BlockCandidate | None = None) -> Det def should_accumulate_metadata(self, candidate: BlockCandidate) -> bool: """Check if we're still in metadata section.""" - return candidate.current_section in ["header", "metadata"] + return candidate.current_section in {SectionType.HEADER, SectionType.METADATA} def extract_block_type(self, candidate: BlockCandidate) -> str | None: """Extract block_type from YAML frontmatter.""" @@ -97,22 +149,6 @@ def extract_block_type(self, candidate: BlockCandidate) -> str | None: # No block_type found in metadata or parse failed, return info_string return self.info_string - def _set_metadata_defaults( - self, - metadata_dict: dict[str, Any], - candidate: BlockCandidate, - metadata_class: type[BaseMetadata], - ) -> None: - """Set default values for id and block_type if using BaseMetadata.""" - if metadata_class is not BaseMetadata: - return - - if "id" not in metadata_dict: - metadata_dict["id"] = f"block_{candidate.compute_hash()}" - - if "block_type" not in metadata_dict: - metadata_dict["block_type"] = self.info_string or "markdown" - def _parse_metadata_instance( self, metadata_class: type[BaseMetadata], @@ -147,8 +183,10 @@ def parse_block( if yaml_error: return ParseResult(success=False, error=f"YAML parse error: {yaml_error}", exception=yaml_error) - # Set defaults for BaseMetadata - self._set_metadata_defaults(metadata_dict, candidate, metadata_class) + # Set default id and block_type if using BaseMetadata + self._set_default_metadata_fields( + metadata_dict, candidate, metadata_class, default_type=self.info_string or "markdown" + ) # Parse metadata instance metadata_result = self._parse_metadata_instance(metadata_class, metadata_dict) diff --git a/tests/syntaxes/test_delimiter.py b/tests/syntaxes/test_delimiter.py index 68eca68..e988952 100644 --- a/tests/syntaxes/test_delimiter.py +++ b/tests/syntaxes/test_delimiter.py @@ -8,7 +8,7 @@ from pydantic import ValidationError from hother.streamblocks.core.models import Block, BlockCandidate -from hother.streamblocks.core.types import BaseContent, BaseMetadata +from hother.streamblocks.core.types import BaseContent, BaseMetadata, SectionType from hother.streamblocks.syntaxes.delimiter import ( DelimiterFrontmatterSyntax, DelimiterPreambleSyntax, @@ -338,18 +338,18 @@ def test_detect_frontmatter_start_in_header(self) -> None: """Test detecting frontmatter start in header section.""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER result = syntax.detect_line("---", candidate) assert result.is_metadata_boundary is True - assert candidate.current_section == "metadata" + candidate.transition_to_metadata.assert_called_once() def test_detect_empty_line_in_header(self) -> None: """Test skipping empty lines in header section.""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER result = syntax.detect_line("", candidate) @@ -360,30 +360,30 @@ def test_detect_content_line_in_header_no_frontmatter(self) -> None: """Test moving to content when no frontmatter in header (lines 213-214).""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER candidate.content_lines = [] syntax.detect_line("content line", candidate) - assert candidate.current_section == "content" + candidate.transition_to_content.assert_called_once() assert "content line" in candidate.content_lines def test_detect_frontmatter_end_in_metadata(self) -> None: """Test detecting frontmatter end in metadata section.""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "metadata" + candidate.current_section = SectionType.METADATA result = syntax.detect_line("---", candidate) assert result.is_metadata_boundary is True - assert candidate.current_section == "content" + candidate.transition_to_content.assert_called_once() def test_detect_metadata_line(self) -> None: """Test accumulating metadata lines.""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "metadata" + candidate.current_section = SectionType.METADATA candidate.metadata_lines = [] syntax.detect_line("key: value", candidate) @@ -394,7 +394,7 @@ def test_detect_closing_in_content(self) -> None: """Test detecting closing delimiter in content section.""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "content" + candidate.current_section = SectionType.CONTENT result = syntax.detect_line("!!end", candidate) @@ -404,7 +404,7 @@ def test_detect_content_line(self) -> None: """Test accumulating content lines and return value (branch 220->225).""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "content" + candidate.current_section = SectionType.CONTENT candidate.content_lines = [] result = syntax.detect_line("some content", candidate) @@ -421,8 +421,8 @@ def test_detect_line_unknown_section_returns_default(self) -> None: syntax = DelimiterFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - # Set to an unexpected section value - candidate.current_section = "unknown" + # Set to an unexpected section value (bypassing type safety for testing) + candidate.current_section = "unknown" # type: ignore[assignment] result = syntax.detect_line("some line", candidate) @@ -439,7 +439,7 @@ def test_should_accumulate_in_header(self) -> None: """Test should_accumulate_metadata returns True in header section.""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER result = syntax.should_accumulate_metadata(candidate) @@ -449,7 +449,7 @@ def test_should_accumulate_in_metadata(self) -> None: """Test should_accumulate_metadata returns True in metadata section.""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "metadata" + candidate.current_section = SectionType.METADATA result = syntax.should_accumulate_metadata(candidate) @@ -459,7 +459,7 @@ def test_should_not_accumulate_in_content(self) -> None: """Test should_accumulate_metadata returns False in content section (line 229).""" syntax = DelimiterFrontmatterSyntax() candidate = MagicMock(spec=BlockCandidate) - candidate.current_section = "content" + candidate.current_section = SectionType.CONTENT result = syntax.should_accumulate_metadata(candidate) diff --git a/tests/syntaxes/test_markdown.py b/tests/syntaxes/test_markdown.py index a8626d7..e4d657c 100644 --- a/tests/syntaxes/test_markdown.py +++ b/tests/syntaxes/test_markdown.py @@ -7,7 +7,7 @@ from pydantic import ValidationError from hother.streamblocks.core.models import Block, BlockCandidate -from hother.streamblocks.core.types import BaseContent, BaseMetadata +from hother.streamblocks.core.types import BaseContent, BaseMetadata, SectionType from hother.streamblocks.syntaxes.markdown import MarkdownFrontmatterSyntax @@ -65,24 +65,24 @@ def test_detect_frontmatter_start_in_header(self) -> None: """Test detecting frontmatter start (---) in header section.""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER result = syntax.detect_line("---", candidate) assert result.is_metadata_boundary is True - assert candidate.current_section == "metadata" + assert candidate.current_section == SectionType.METADATA def test_empty_line_in_header_stays_in_header(self) -> None: """Test that empty lines in header section don't change state.""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER result = syntax.detect_line("", candidate) assert result.is_opening is False assert result.is_metadata_boundary is False - assert candidate.current_section == "header" + assert candidate.current_section == SectionType.HEADER def test_content_line_in_header_moves_to_content(self) -> None: """Test that non-empty, non-frontmatter line in header moves to content. @@ -91,11 +91,11 @@ def test_content_line_in_header_moves_to_content(self) -> None: """ syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER result = syntax.detect_line("Some content directly after fence", candidate) - assert candidate.current_section == "content" + assert candidate.current_section == SectionType.CONTENT assert "Some content directly after fence" in candidate.content_lines assert result.is_opening is False assert result.is_metadata_boundary is False @@ -104,30 +104,30 @@ def test_content_line_in_header_no_frontmatter_fence(self) -> None: """Test handling content immediately after fence (no --- frontmatter).""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER # First content line after fence syntax.detect_line("print('hello')", candidate) - assert candidate.current_section == "content" + assert candidate.current_section == SectionType.CONTENT assert candidate.content_lines == ["print('hello')"] def test_frontmatter_end_in_metadata_section(self) -> None: """Test detecting frontmatter end (---) in metadata section.""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "metadata" + candidate.current_section = SectionType.METADATA result = syntax.detect_line("---", candidate) assert result.is_metadata_boundary is True - assert candidate.current_section == "content" + assert candidate.current_section == SectionType.CONTENT def test_metadata_line_accumulates(self) -> None: """Test that metadata lines are accumulated.""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "metadata" + candidate.current_section = SectionType.METADATA syntax.detect_line("key: value", candidate) @@ -137,7 +137,7 @@ def test_closing_fence_in_content_section(self) -> None: """Test detecting closing fence in content section.""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "content" + candidate.current_section = SectionType.CONTENT result = syntax.detect_line("```", candidate) @@ -147,7 +147,7 @@ def test_content_line_accumulates(self) -> None: """Test that content lines are accumulated and return value (branch 77->83).""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "content" + candidate.current_section = SectionType.CONTENT result = syntax.detect_line("content line", candidate) @@ -179,7 +179,7 @@ def test_should_accumulate_in_header_section(self) -> None: """Test returns True when in header section.""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "header" + candidate.current_section = SectionType.HEADER result = syntax.should_accumulate_metadata(candidate) @@ -189,7 +189,7 @@ def test_should_accumulate_in_metadata_section(self) -> None: """Test returns True when in metadata section.""" syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "metadata" + candidate.current_section = SectionType.METADATA result = syntax.should_accumulate_metadata(candidate) @@ -202,7 +202,7 @@ def test_should_not_accumulate_in_content_section(self) -> None: """ syntax = MarkdownFrontmatterSyntax() candidate = BlockCandidate(syntax=syntax, start_line=1) - candidate.current_section = "content" + candidate.current_section = SectionType.CONTENT result = syntax.should_accumulate_metadata(candidate) @@ -557,7 +557,7 @@ def test_full_block_workflow_without_frontmatter(self) -> None: break # Verify we moved to content directly - assert candidate.current_section == "content" + assert candidate.current_section == SectionType.CONTENT assert len(candidate.content_lines) == 2 # Parse the complete block @@ -584,5 +584,5 @@ def test_custom_fence_syntax(self) -> None: assert syntax.detect_line("~~~").is_opening is True - candidate.current_section = "content" + candidate.current_section = SectionType.CONTENT assert syntax.detect_line("~~~", candidate).is_closing is True diff --git a/tests/test_models.py b/tests/test_models.py index 56d2063..4f91fda 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -20,7 +20,8 @@ def test_repr_initial_state(self) -> None: assert "start_line=1" in repr_str assert "state=header_detected" in repr_str assert "lines=0" in repr_str - assert "section='header'" in repr_str + assert "section=" in repr_str + assert "HEADER" in repr_str def test_repr_with_lines(self) -> None: """Test __repr__ after adding lines."""