elastic · sketchybinary · May 1, 2026 · May 1, 2026 · May 1, 2026
@@ -0,0 +1,18 @@
+# CI image - installs build prerequisites, copies the repository and runs the test suite.
+FROM jruby:9.4.12.0-jdk21@sha256:5641622b488d298362b96fdaea0f328248ce55962e68e224118be11ddb48d16e
+
+# Install OS packages and remove the apt package lists in the same layer so
+# they are not baked into the image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    libicu-dev netbase make && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV IS_DOCKER=1
+
+# Unprivileged user/group with a fixed UID/GID (451) that owns and runs the code
+RUN groupadd -g 451 crawlergroup && \
+    useradd -m -u 451 -g crawlergroup crawleruser
+
+USER crawleruser
+COPY --chown=crawleruser:crawlergroup --chmod=775 . /home/app
+WORKDIR /home/app
+
+# Install ALL gems including test/development
+RUN make clean install
+
+# Running the container executes the `test` target of the Makefile
+ENTRYPOINT [ "make", "test" ]
@@ -1,4 +1,4 @@
-.phony: test lint autocorrect install install-ci install-gems install-jars clean notice build-docker-ci list-gems list-jars
+# Special target names are case-sensitive in GNU Make: it must be spelled .PHONY,
+# otherwise the targets below are treated as files and may be skipped as "up to date".
+.PHONY: test lint autocorrect install install-ci install-gems install-jars clean notice build-docker-ci build-docker-wolfi list-gems list-jars push
 
 test:
 	script/rspec $(file)
@@ -36,6 +36,10 @@ build-docker-ci:
 build-docker-wolfi:
 	docker build -t crawler-ci-wolfi -f Dockerfile.wolfi .
 
+# Tag the locally built `crawler-ci` image and push it to the registry.
+# Pick the tag on the command line, e.g. `make push tag=1.2.3`; when no tag
+# is given the previous fixed value `tagname` is used.
+push:
+	docker tag crawler-ci corgicloud/crawler:$(or $(tag),tagname)
+	docker push corgicloud/crawler:$(or $(tag),tagname)
+
 list-gems:
 	script/bundle exec gem dependency
 

@@ -243,6 +243,7 @@ GET /web-crawl-test/_search
 
 - [Crawl lifecycle](docs/ADVANCED.md#crawl-lifecycle) - Understand how the crawler discovers, queues, and indexes content across two stages: the primary crawl and the purge crawl
 - [Extraction rules](docs/features/EXTRACTION_RULES.md) - Define how crawler extracts content from HTML
+- [Markdown reformatting](docs/features/MARKDOWN_REFORMATTING.md) - Reformat HTML content into Markdown for LLM/RAG use
 - [Binary content extraction](docs/features/BINARY_CONTENT_EXTRACTION.md) - Extract text from PDFs, DOCX files
 - [Crawler directives](docs/features/CRAWLER_DIRECTIVES.md) - Use robots.txt, meta tags, or embedded data attributes to guide discovery and content extraction
 - [Scheduling](docs/features/SCHEDULING.md) - Automate crawls with cron scheduling

@@ -83,7 +83,11 @@
 ##   dramatically increase the index size if the site being crawled is large. Defaults to false.
 #full_html_extraction_enabled: false
 #
+## Whether or not to convert the extracted HTML body into Markdown before it is indexed
+##   in the `body` field. Defaults to false.
+#markdown_reformatting_enabled: false
+#
 ## Scheduling using cron expressions
+
 #schedule:
 #  pattern: "0 12 * * *"     # every day at noon
 #

@@ -0,0 +1,6 @@
+# Example configuration: crawl from a single seed URL using the console output sink,
+# with the HTML body reformatted as Markdown.
+output_sink: console
+markdown_reformatting_enabled: true
+domains:
+  - url: https://www.elastic.co
+    seed_urls:
+      - https://www.elastic.co/docs/solutions/search
@@ -0,0 +1,35 @@
+# HTML to Markdown Reformatting
+
+
+The Open Crawler can automatically reformat HTML content into Markdown before indexing it into Elasticsearch. This is particularly useful for Large Language Model (LLM) applications and Retrieval-Augmented Generation (RAG), as Markdown is often a preferred format for these systems.
+
+## Configuration
+
+To enable Markdown reformatting, set `markdown_reformatting_enabled: true` in your crawler configuration file.
+
+```yaml
+# crawler.yml
+markdown_reformatting_enabled: true
+```
+
+## How it Works
+
+When enabled, the crawler will:
+
+1.  Extract the relevant content from the HTML body (respecting `exclude_tags` and `data-elastic-exclude` attributes).
+2.  Convert common HTML tags into their Markdown equivalents:
+    *   Headers (`<h1>` - `<h6>`)
+    *   Paragraphs (`<p>`) and line breaks (`<br>`)
+    *   Bold (`<strong>`, `<b>`) and Italic (`<em>`, `<i>`)
+    *   Links (`<a>`)
+    *   Lists (`<ul>`, `<ol>`, `<li>`); every ordered-list item is emitted with the marker `1.`
+    *   Images (`<img>`)
+    *   Inline code (`<code>`) and fenced code blocks (`<pre>`)
+3.  Remove non-content tags like `<script>`, `<style>`, `<svg>`, etc.
+4.  Collapse runs of three or more consecutive newlines into a single blank line to keep the Markdown clean.
+
+## Benefits for RAG
+
+*   **Improved Context:** LLMs are typically trained on large amounts of Markdown content (e.g., from GitHub, Wikipedia), making them more effective at parsing and understanding reformatted web content.
+*   **Reduced Token Usage:** Markdown often uses fewer characters than raw HTML to represent the same structure, helping to optimize token usage in LLM prompts.
+*   **Cleaner Data:** By removing unnecessary HTML boilerplate and non-content tags, you ensure that the indexed data is focused on the actual information.
@@ -63,6 +63,7 @@ class Config # rubocop:disable Metrics/ClassLength
         :stats_dump_interval,  # How often should we output stats in the logs during a crawl
         :purge_crawl_enabled,  # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
         :full_html_extraction_enabled, # Whether or not to include the full HTML in the crawl result JSON
+        :markdown_reformatting_enabled, # Whether or not to reformat HTML into Markdown for indexing
 
         # Elasticsearch settings
         :elasticsearch, # Elasticsearch connection settings
@@ -206,6 +207,7 @@ class Config # rubocop:disable Metrics/ClassLength
         crawl_rules: {},
         purge_crawl_enabled: true,
         full_html_extraction_enabled: false,
+        markdown_reformatting_enabled: false,
 
         # Sink lock retry settings
         sink_lock_retry_interval: 1,

@@ -0,0 +1,79 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0;
+# you may not use this file except in compliance with the Elastic License 2.0.
+#
+
+# frozen_string_literal: true
+
+module Crawler
+  module ContentEngine
+    module Markdown
+      java_import org.jsoup.nodes.TextNode
+      java_import org.jsoup.nodes.Element
+
+      # Convert a Jsoup HTML node to Markdown
+      def self.convert(node)
+        return '' unless node
+
+        markdown = []
+        process_node(node, markdown)
+        markdown.join.gsub(/\n{3,}/, "\n\n").strip
+      end
+
+      def self.process_node(node, markdown, list_type = nil)
+        if node.is_a?(TextNode)
+          text = node.text
+          markdown << text if text && !text.empty?
+          return
+        end
+
+        return unless node.is_a?(Element)
+
+        tag = node.tagName.downcase
+        return if Crawler::ContentEngine::Utils::NON_CONTENT_TAGS.include?(tag)
+
+        handle_start_tag(tag, node, markdown, list_type)
+
+        # Process children
+        list_type_for_children = %w[ul ol].include?(tag) ? tag.to_sym : list_type
+        node.childNodes.each { |child| process_node(child, markdown, list_type_for_children) }
+
+        handle_end_tag(tag, node, markdown)
+      end
+
+      START_TAG_MAPPING = {
+        'h1' => "\n\n# ", 'h2' => "\n\n## ", 'h3' => "\n\n### ",
+        'h4' => "\n\n#### ", 'h5' => "\n\n##### ", 'h6' => "\n\n###### ",
+        'p' => "\n\n", 'br' => "\n", 'strong' => '**', 'b' => '**',
+        'em' => '*', 'i' => '*', 'a' => '[', 'ul' => "\n", 'ol' => "\n",
+        'code' => '`', 'pre' => "\n```\n"
+      }.freeze
+
+      END_TAG_MAPPING = {
+        'h1' => "\n\n", 'h2' => "\n\n", 'h3' => "\n\n",
+        'h4' => "\n\n", 'h5' => "\n\n", 'h6' => "\n\n", 'p' => "\n\n",
+        'strong' => '**', 'b' => '**', 'em' => '*', 'i' => '*',
+        'code' => '`', 'pre' => "\n```\n"
+      }.freeze
+
+      def self.handle_start_tag(tag, node, markdown, list_type)
+        if START_TAG_MAPPING.key?(tag)
+          markdown << START_TAG_MAPPING[tag]
+        elsif tag == 'li'
+          markdown << (list_type == :ol ? "\n1. " : "\n* ")
+        elsif tag == 'img'
+          markdown << "![#{node.attr('alt')}](#{node.attr('src')})"
+        end
+      end
+
+      def self.handle_end_tag(tag, node, markdown)
+        if END_TAG_MAPPING.key?(tag)
+          markdown << END_TAG_MAPPING[tag]
+        elsif tag == 'a'
+          markdown << "](#{node.attr('href')})"
+        end
+      end
+    end
+  end
+end
@@ -13,11 +13,18 @@ module ContentEngine
     module Utils
       # A list of tags we want to remove before extracting content
       NON_CONTENT_TAGS = %w[
+        audio
+        canvas
         comment
+        embed
+        iframe
+        math
+        noscript
         object
         script
         style
         svg
+        template
         video
       ].freeze
 

@@ -8,6 +8,7 @@
 
 require_dependency(File.join(__dir__, 'success'))
 require_dependency(File.join(__dir__, '..', '..', '..', 'constants'))
+require_dependency(File.join(__dir__, '..', '..', 'content_engine', 'markdown'))
 
 java_import org.jsoup.nodes.TextNode
 
@@ -192,12 +193,16 @@ def document_title(limit: 1000)
         end
 
         # Returns the body of the document, cleaned up for indexing
+        # When +markdown+ is true the transformed body is rendered with
+        # Crawler::ContentEngine::Markdown.convert; otherwise its descendant text is used.
+        # Either result is passed through Utils.limit_bytesize with +limit+.
-        def document_body(limit: 5.megabytes, exclude_tags: nil)
+        def document_body(limit: 5.megabytes, exclude_tags: nil, markdown: false)
           body_tag = get_body_tag(exclude_tags)
           return '' unless body_tag
 
           body_tag = Crawler::ContentEngine::Transformer.transform(body_tag)
-          body_content = Crawler::ContentEngine::Utils.node_descendant_text(body_tag)
+          body_content = if markdown
+                           Crawler::ContentEngine::Markdown.convert(body_tag)
+                         else
+                           Crawler::ContentEngine::Utils.node_descendant_text(body_tag)
+                         end
           Crawler::ContentEngine::Utils.limit_bytesize(body_content, limit)
         end
 

@@ -68,7 +68,11 @@ def core_fields(crawl_result)
     def html_fields(crawl_result) # rubocop:disable Metrics/AbcSize
       remove_empty_values(
         title: crawl_result.document_title(limit: config.max_title_size),
-        body: crawl_result.document_body(limit: config.max_body_size, exclude_tags: config.exclude_tags),
+        body: crawl_result.document_body(
+          limit: config.max_body_size,
+          exclude_tags: config.exclude_tags,
+          markdown: config.markdown_reformatting_enabled
+        ),
         meta_keywords: crawl_result.meta_keywords(limit: config.max_keywords_size),
         meta_description: crawl_result.meta_description(limit: config.max_description_size),
         links: crawl_result.links(limit: config.max_indexed_links_count),

@@ -4,6 +4,9 @@
 # you may not use this file except in compliance with the Elastic License 2.0.
 #
 
+#
+#
+
 # frozen_string_literal: true
 
 require 'yaml'
@@ -65,6 +68,51 @@
       expect(config.seed_urls.map(&:to_s).to_a).to match_array(expected_seed_urls)
       expect(config.output_sink).to eq(:elasticsearch)
       expect(config.output_dir).to eq('./crawled_docs')
+      expect(config.markdown_reformatting_enabled).to be(false)
+    end
+
+    # Builds a config with many explicit options and reads a subset of them back.
+    # NOTE(review): max_url_length, max_url_params, max_url_segments, max_body_size,
+    # max_response_size and the elasticsearch index/pipeline are supplied but never asserted.
+    it 'can enable markdown reformatting and other features' do
+      config = Crawler::API::Config.new(
+        domains:,
+        markdown_reformatting_enabled: true,
+        sitemap_discovery_disabled: true,
+        head_requests_enabled: true,
+        compression_enabled: false,
+        default_encoding: 'ISO-8859-1',
+        max_duration: 3600,
+        max_crawl_depth: 5,
+        max_unique_url_count: 1000,
+        max_url_length: 512,
+        max_url_params: 10,
+        max_url_segments: 8,
+        max_body_size: 1.megabyte,
+        max_response_size: 2.megabytes,
+        elasticsearch: {
+          host: 'http://es:9200',
+          index: 'test-index',
+          pipeline: 'test-pipeline'
+        },
+        http_proxy_host: 'proxy',
+        http_proxy_port: 3128,
+        http_proxy_protocol: 'https',
+        http_proxy_username: 'user',
+        http_proxy_password: 'pass'
+      )
+
+      expect(config.markdown_reformatting_enabled).to be(true)
+      expect(config.sitemap_discovery_disabled).to be(true)
+      expect(config.head_requests_enabled).to be(true)
+      expect(config.compression_enabled).to be(false)
+      expect(config.default_encoding).to eq('ISO-8859-1')
+      expect(config.max_duration).to eq(3600)
+      expect(config.max_crawl_depth).to eq(5)
+      expect(config.max_unique_url_count).to eq(1000)
+      expect(config.elasticsearch[:host]).to eq('http://es:9200')
+      expect(config.http_proxy_host).to eq('proxy')
+      expect(config.http_proxy_port).to eq(3128)
+      expect(config.http_proxy_protocol).to eq('https')
+      expect(config.http_proxy_username).to eq('user')
+      expect(config.http_proxy_password).to eq('pass')
     end
 
     context 'when a domain has an internationalized domain name' do
@@ -398,5 +446,25 @@ def expect_x509_certificates(certs)
         end
       end
     end
+
+    describe 'sensitive fields' do
+      it 'identifies sensitive fields correctly' do
+        expect(Crawler::API::Config::SENSITIVE_FIELDS).to include(:auth)
+        expect(Crawler::API::Config::SENSITIVE_FIELDS).to include(:http_header_service)
+        expect(Crawler::API::Config::SENSITIVE_FIELDS).to include(:elasticsearch)
+      end
+    end
+
+    describe 'default values' do
+      let(:config) { Crawler::API::Config.new(domains:) }
+
+      it 'has correct default values' do
+        expect(config.stats_dump_interval).to eq(10)
+        expect(config.purge_crawl_enabled).to be(true)
+        expect(config.full_html_extraction_enabled).to be(false)
+        expect(config.sink_lock_retry_interval).to eq(1)
+        expect(config.sink_lock_max_retries).to eq(120)
+      end
+    end
   end
 end