diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index 702b10c68..55798347c 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -39,6 +39,7 @@
     EpubConverter,
     DocumentIntelligenceConverter,
     CsvConverter,
+    EmlConverter,
 )
 
 from ._base_converter import DocumentConverter, DocumentConverterResult
@@ -195,6 +196,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(OutlookMsgConverter())
             self.register_converter(EpubConverter())
             self.register_converter(CsvConverter())
+            self.register_converter(EmlConverter())
 
             # Register Document Intelligence converter at the top of the stack if endpoint is provided
             docintel_endpoint = kwargs.get("docintel_endpoint")
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index e4437a582..650ad9df2 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -17,6 +17,7 @@
 from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
+from ._eml_converter import EmlConverter
 from ._doc_intel_converter import (
     DocumentIntelligenceConverter,
     DocumentIntelligenceFileType,
@@ -45,4 +46,5 @@
     "DocumentIntelligenceFileType",
     "EpubConverter",
     "CsvConverter",
+    "EmlConverter",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/_eml_converter.py b/packages/markitdown/src/markitdown/converters/_eml_converter.py
new file mode 100644
index 000000000..e1dcfe2e8
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_eml_converter.py
@@ -0,0 +1,173 @@
+from typing import Any, BinaryIO, Iterator, List, Tuple
+
+from email import policy
+from email.message import EmailMessage
+from email.parser import BytesParser
+from email.utils import getaddresses
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "message/",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [
+    ".eml",
+]
+
+# Address headers rendered in the "Email Headers" section, in output order.
+ADDRESS_HEADERS: Tuple[str, ...] = ("From", "To", "Cc", "Bcc")
+
+# Content types that are rendered as the body of a non-multipart message.
+SINGLE_PART_BODY_TYPES: Tuple[str, ...] = (
+    "text/plain",
+    "text/html",
+    "text/rfc822-headers",
+)
+
+
+def _format_addresses(msg: EmailMessage, header_name: str) -> str:
+    """Return all addresses found in `header_name`, comma separated.
+
+    Each entry is rendered as `Name <address>`, or as the bare address when
+    no display name is present. Returns "" when the header is absent.
+    """
+    pairs = getaddresses(msg.get_all(header_name, []))
+    return ", ".join(
+        f"{name} <{addr}>" if name else addr for name, addr in pairs if addr
+    )
+
+
+def _iter_body_leaves(part: EmailMessage) -> Iterator[EmailMessage]:
+    """Yield the non-multipart parts under `part`, skipping attachment subtrees.
+
+    Unlike `walk()`, this never descends into a part whose disposition is
+    "attachment", so the text of an attached message is not mistaken for the
+    body of the enclosing email.
+    """
+    if part.get_content_disposition() == "attachment":
+        return
+    if part.is_multipart():
+        for child in part.get_payload():
+            yield from _iter_body_leaves(child)
+    else:
+        yield part
+
+
+def _decode_text(part: EmailMessage) -> str:
+    """Return the decoded text of a `text/*` part, tolerating bad charsets."""
+    try:
+        return part.get_content()
+    except (LookupError, UnicodeError):
+        # The declared charset is unknown or cannot decode the payload. Fall
+        # back to a lossy UTF-8 decode rather than failing the conversion.
+        raw = part.get_payload(decode=True)
+        return raw.decode("utf-8", errors="replace") if raw else ""
+
+
+def _extract_body(msg: EmailMessage) -> List[str]:
+    """Return the text chunks that make up the body of `msg`.
+
+    Plain text is preferred. HTML parts are used only when the message has
+    no `text/plain` part outside of its attachments.
+    """
+    if not msg.is_multipart():
+        if msg.get_content_type() in SINGLE_PART_BODY_TYPES:
+            return [_decode_text(msg)]
+        return []
+
+    leaves = [
+        leaf for child in msg.get_payload() for leaf in _iter_body_leaves(child)
+    ]
+    has_plain = any(leaf.get_content_type() == "text/plain" for leaf in leaves)
+    wanted = "text/plain" if has_plain else "text/html"
+    return [_decode_text(leaf) for leaf in leaves if leaf.get_content_type() == wanted]
+
+
+def _list_attachments(msg: EmailMessage) -> List[str]:
+    """Return one Markdown bullet per named attachment in `msg`."""
+    if not msg.is_multipart():
+        return []
+
+    bullets: List[str] = []
+    for part in msg.walk():
+        if part.get_content_disposition() != "attachment":
+            continue
+        filename = part.get_filename()
+        if not filename:
+            continue
+        # Decode the transfer encoding so the size is the real byte count.
+        # Multipart attachments have no single payload and are reported as 0.
+        payload = part.get_payload(decode=True)
+        size = len(payload) if isinstance(payload, bytes) else 0
+        bullets.append(f"- {filename} ({part.get_content_type()}, {size:,} bytes)")
+    return bullets
+
+
+class EmlConverter(DocumentConverter):
+    """Converts EML (email) files to Markdown. Preserves headers, body, and attachments info."""
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,  # noqa: ARG002 - required by interface
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Return True for a `.eml` extension or any `message/*` MIME type."""
+        extension = (stream_info.extension or "").lower()
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        mimetype = (stream_info.mimetype or "").lower()
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,  # noqa: ARG002 - kept for interface compatibility
+        **kwargs: Any,  # noqa: ARG002 - options are currently unused
+    ) -> DocumentConverterResult:
+        """Convert an EML message to markdown.
+
+        The output has an "Email Headers" section (From/To/Cc/Bcc, Subject,
+        Date), an "Email Content" section with the message body, and an
+        "Attachments" section listing each named attachment with its MIME
+        type and decoded size. The subject, when present, becomes the title.
+        """
+        # Read the full message from the binary stream and parse it
+        msg = BytesParser(policy=policy.default).parsebytes(file_stream.read())
+
+        md_parts: List[str] = ["## Email Headers\n"]
+
+        # From, To, Cc, Bcc in a readable format
+        for header in ADDRESS_HEADERS:
+            value = _format_addresses(msg, header)
+            if value:
+                md_parts.append(f"**{header}:** {value}")
+
+        # Other common headers
+        subject = msg.get("Subject", "")
+        if subject:
+            md_parts.append(f"**Subject:** {subject}")
+
+        date = msg.get("Date", "")
+        if date:
+            md_parts.append(f"**Date:** {date}")
+
+        md_parts.append("\n## Email Content\n")
+
+        body_text = _extract_body(msg)
+        if body_text:
+            md_parts.append("\n".join(body_text))
+
+        attachments = _list_attachments(msg)
+        if attachments:
+            md_parts.append("\n## Attachments\n")
+            md_parts.extend(attachments)
+
+        return DocumentConverterResult(
+            markdown="\n".join(md_parts).strip(),
+            title=str(subject) if subject else None,
+        )
diff --git a/packages/markitdown/tests/test_files/test.eml b/packages/markitdown/tests/test_files/test.eml
new file mode 100644
index 000000000..15f6b85cf
--- /dev/null
+++ b/packages/markitdown/tests/test_files/test.eml
@@ -0,0 +1,33 @@
+Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
+MIME-Version: 1.0
+Subject: Test Email Document
+From: John Doe
+To: Jane Smith
+Date: Wed, 18 Dec 2024 10:00:00 +0000
+CC: cc.person@example.com
+
+--===============8484938434343225034==
+Content-Type: text/plain; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+
+
+This is a test email with multiple parts.
+
+It contains:
+- Plain text content
+- An attachment
+- Various headers
+
+Best regards,
+John Doe
+
+--===============8484938434343225034==
+Content-Type: application/txt
+MIME-Version: 1.0
+Content-Transfer-Encoding: base64
+Content-Disposition: attachment; filename="test.txt"
+
+VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
+
+--===============8484938434343225034==--