crawler/config/crawler.yml.example at main · elastic/crawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
## ================== Open Crawler Configuration - Elasticsearch ====================
#
##  Open Crawler configuration settings. One configuration file can be used to
##       define one Open Crawler/crawl job
#
##  NOTE: Most Open Crawler configurations come with reasonable defaults.
##       Before adjusting the configuration, make sure you understand what you
##       are trying to accomplish and the consequences.
#
## ------------------------------- Open Crawler ------------------------------------
#
## The domain(s) that Open Crawler will crawl. This is an array. All domains in this
##   array will be crawled concurrently by the Open Crawler with a shared output.
##   They are separated to allow for domain-specific configurations.
#
#domains:
#  - url: http://localhost:8000         # The base URL for this domain
#    seed_urls:                         # The entry point(s) for crawl jobs
#      - http://localhost:8000/foo
#      - http://localhost:8000/bar
#    sitemap_urls:                      # The location(s) of sitemap files
#      - http://localhost:8000/sitemap.xml
#    auth:
#      type: basic                      # One of two types: raw | basic
#      # Basic authentication
#      #   Requires a username and password
#      username: local
#      password: your-password-here
#      # Raw authentication
#      #   Requires the raw header string you wish to provide to each URL request.
#      #   For example, when providing a JWT token, you must provide it with the 'Bearer:' prefix:
#      # "Bearer: eyJhb.eyJzd.KMUFs"
#      header: your-auth-header
#
#    # An array of HTML5 tags to exclude from the crawl
#    exclude_tags:
#      - header
#      - footer
#
#    # An array of crawl rules
#    # See docs/features/CRAWL_RULES.md for more details on this feature
#    crawl_rules:
#      - policy: deny       # the policy for this rule, either: allow | deny
#        type: begins       # the type of rule, any of: begins | ends | contains | regex
#        pattern: /blog     # the pattern string for the rule
#
#    # An array of content extraction rules
#    # See docs/features/EXTRACTION_RULES.md for more details on this feature
#    extraction_rulesets:
#      - url_filters:
#          - type: begins           # Filter type, can be: begins | ends | contains | regex
#            pattern: /blog         # The pattern for the filter
#        rules:
#          - action: extract        # Rule action, can be: extract | set
#            field_name: author     # The ES doc field to add the value to
#            selector: .author      # CSS or XPATH selector if source is `html`, regexp if source is `url`
#            join_as: array         # How to concatenate multiple values, can be: array | string
#            value: yes             # The value to use, only applicable if action is `set`
#            source: html           # The source to extract from, can be: html | url
#
## Where to send the results. Possible values are console, file, or elasticsearch
#output_sink: elasticsearch
#
## Elasticsearch index name to ingest crawl results into. Required if output_sink is elasticsearch
#output_index: my-index
#
## Local directory to output crawl results. The default value is ./crawled_docs
##   This will appear at the top level of your Open Crawler directory if running from source,
##   and under the home/app/ directory inside your container if running via Docker.
#output_dir: output/local-site
#
## The maximum depth that Open Crawler will follow links to.
#max_crawl_depth: 2
#
## The maximum number of links Open Crawler can extract from each page.
##   Default value is 1000.
#max_extracted_links_count: 1000
#
## Whether or not the Open Crawler should purge outdated documents after completing a crawl. Defaults to true
#purge_crawl_enabled: true
#
## Whether or not to include the full HTML in the crawl result. Enabling full HTML extraction can
##   dramatically increase the index size if the site being crawled is large. Defaults to false.
#full_html_extraction_enabled: false
#
## Scheduling using cron expressions
#schedule:
#  pattern: "0 12 * * *"     # every day at noon
#
## Crawl result field size limits
#max_title_size: 1000
#max_body_size: 5_242_880 # 5 megabytes
#max_keywords_size: 512
#max_description_size: 512
#max_indexed_links_count: 10
#max_headings_count: 10
#max_elastic_tag_size: 512
#max_data_attribute_size: 512
#max_response_size: 10_485_760 # 10 megabytes

## ------------------------------- URL Limits ----------------------------------
## The maximum amount of time, in seconds, to crawl a single web page.
##   Default value is 24 hours.
#max_duration: 86_400 # 24 hours
#
## The maximum number of unique URLs Open Crawler can process before stopping.
##   Default is 100000.
#max_unique_url_count: 100_000
#
## The maximum length a URL can be, in characters.
##   Default value is 2048 characters.
#max_url_length: 2048
#
## The maximum number of segments a URL can contain.
##   Default value is 16.
#max_url_segments: 16
#
## The maximum number of parameters a URL can contain.
##   Default value is 10.
#max_url_params: 10
#
## The maximum number of redirects the Open Crawler can follow before raising an error.
##   Default value is 10.
#max_redirects: 10
#
## ------------------------------- Connection Timeout --------------------------
#
## Timeout value, in seconds, for establishing connections.
##   Default value is 10 seconds.
#connect_timeout: 10
#
## The timeout value, in seconds, before a socket connection is timed out.
##   Default value is 10 seconds.
#socket_timeout: 10
#
## The per-request timeout value, in seconds.
##   Default value is 60 seconds.
#request_timeout: 60
#
## ------------------------------- Open Crawler - Advanced --------------------------
#
## The maximum number of threads to utilize.
##   Default value is 10.
#threads_per_crawl: 10
#
## The maximum number of in-flight URLs the in-memory queue can hold.
##   Default value is 10000.
#url_queue_size_limit: 10_000
#
## The default encoding to use. Open Crawler uses 'UTF-8' as standard.
#default_encoding: 'UTF-8'
#
## Enable HTTP content compression.
##   Default value is true.
#compression_enabled: true
#
## Allow performing HEAD requests before GET requests when crawling websites.
##   Default is false
#head_requests_enabled: false
#
## Set a custom User-agent name for the requests Open Crawler makes.
##   Default is Elastic-Crawler (crawler-version-number-here).
##   Ex:   Elastic-Crawler (0.3.0)
#user_agent: custom-user-agent
#
## Proxy configurations.
#http_proxy_host: localhost
#http_proxy_port: 8888
#http_proxy_protocol: http
#http_proxy_username: ent-search
#http_proxy_password: changeme
#
## Whether authenticated crawling of non-HTTPS URLs is allowed.
## Defaults to 'false'.
#http_auth_allowed: false
#
## Allow Open Crawler to access localhost.
## Defaults to 'false'.
#loopback_allowed: true
#
## Allow Open Crawler to access private IP space
## Defaults to 'false'.
#private_networks_allowed: false
#
## SSL configurations.
## The SSL verification mode to use when crawling. Setting it to `none` disables
##   verification; by default, SSL verification is performed.
#ssl_verification_mode: none
#
## When crawling, these custom certificate authorities are considered trusted.
##   CA Certificates can be provided in one of three ways:
##     1. CA Certificates can be read from a file
##     2. They can be provided as a single line with new lines replaced with `\n`
##     3. Finally, they can be provided as a multi-line YAML String
#ssl_ca_certificates:
# - /path/to/ssl/certificate.pem
# - "-----BEGIN CERTIFICATE-----\nMIID...\n...CERT DATA...\n-----END CERTIFICATE-----"
# - |-
#   -----BEGIN CERTIFICATE-----
#   MIID...........
#   ...CERT DATA...
#   -----END CERTIFICATE-----
#

## Whether document metadata from certain content types will be indexed or not.
##     This does not allow binary content to be indexed from these files, only metadata.
##     See docs/features/BINARY_CONTENT_EXTRACTION.md for more details.
#binary_content_extraction_enabled: true
#binary_content_extraction_mime_types:
#  - application/pdf
#  - application/msword
#  - application/vnd.openxmlformats-officedocument.wordprocessingml.document
#  - application/vnd.ms-powerpoint
#  - application/vnd.openxmlformats-officedocument.presentationml.presentation
#
## ------------------------------- Logging -------------------------------------
#
## The log level for system logs. Defaults to `info`
#log_level: info
#
## Whether or not event logging is enabled for output to file.
##    Event logs are noisy but have a lot of granularity; these can
##    be useful for debugging failing Crawlers.
##    Defaults to false
#event_logs_to_file: false
#
## Whether or not system logs are enabled for output to file.
##    Defaults to false
#system_logs_to_file: false
#
## The directory under which Open Crawler will save log files.
##    By default, it will create a folder called 'logs/'
##    at the top level of Open Crawler's working directory.
#log_file_directory: "./logs"
#
## The rotation policy for log files. Defaults to 'weekly'.
##    Choices are 'daily' | 'weekly' | 'monthly'
#log_file_rotation_policy: 'weekly'
#
## How often Crawler will output stats in logging output during a crawl.
##   Default is 10 seconds.
#stats_dump_interval: 10
#
## Set a custom crawl ID for this crawler.
##   Will appear in log messages as [crawl:unique-crawl_id]
##   Default is a random hash value.
#crawl_id: custom-crawl-id
#
## ------------------------------- Elasticsearch -------------------------------
#
## Elasticsearch connection settings for this specific crawler/crawl job.
##     See elasticsearch.yml.example for detailed configurations.
##
#elasticsearch:
#  host: http://localhost
#  port: 9200
#  username: elastic
#  password: changeme
#  api_key: 1234
#  pipeline: ent-search-generic-ingestion
#  pipeline_enabled: true
#  pipeline_params:
#    _reduce_whitespace: true
#    _run_ml_inference: true
#    _extract_binary_content: true
#  bulk_api:
#    max_items: 10
#    max_size_bytes: 1_048_576

## The interval in seconds to wait before retrying to acquire the sink lock.
#sink_lock_retry_interval: 1

## The maximum number of times to retry acquiring the sink lock.
#sink_lock_max_retries: 120