diff --git a/Dockerfile b/Dockerfile index 62c461f3..3e7bac4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,11 +26,12 @@ RUN apt-get install -y \ libbz2-dev \ liblzma-dev \ liblz4-dev \ - libzstd-dev + libzstd-dev \ + libbrotli-dev -RUN cd / &&\ +RUN cd / && \ git clone https://github.com/Genivia/ugrep -RUN cd ugrep &&\ - ./build.sh &&\ +RUN cd ugrep && \ + ./build.sh && \ make install diff --git a/Dockerfile-minimized b/Dockerfile-minimized index 1df3f569..34b3d5e9 100644 --- a/Dockerfile-minimized +++ b/Dockerfile-minimized @@ -1,4 +1,4 @@ -# step 1: create a debian or ubuntu container for ugrep named "ugrep" +# step 1: create a debian or ubuntu minimized container for ugrep named "ugrep" # docker -D build --no-cache -t ugrep . # # step 2: run bash in the container, e.g. to run ugrep from the command line @@ -22,7 +22,8 @@ RUN apt-get update && \ libbz2-dev \ liblzma-dev \ liblz4-dev \ - libzstd-dev && \ + libzstd-dev \ + libbrotli-dev && \ git clone --depth=1 https://github.com/Genivia/ugrep && \ cd ugrep && \ ./build.sh && \ @@ -38,5 +39,6 @@ RUN apt-get update && \ libbz2-dev \ liblzma-dev \ liblz4-dev \ - libzstd-dev && \ + libzstd-dev \ + libbrotli-dev && \ rm -rf /var/lib/apt/lists/* diff --git a/Makefile.am b/Makefile.am index 7267f805..e8395fa1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -68,9 +68,9 @@ install-data-hook: fi @if [ "x$(zshcompletiondir)" != "x" ]; then \ cd $(DESTDIR)$(zshcompletiondir) && \ - sed -e 's/^#compdef ug/#compdef ug+/' _ug > _ug+ && \ - sed -e 's/^#compdef ug/#compdef ugrep/' _ug > _ugrep && \ - sed -e 's/^#compdef ug/#compdef ugrep+/' _ug > _ugrep+; \ + $(LN_S) -f _ug _ug+ && \ + $(LN_S) -f _ug _ugrep && \ + $(LN_S) -f _ug _ugrep+; \ fi @echo " ______________________________________________________ "; \ echo "| |"; \ diff --git a/Makefile.in b/Makefile.in index 4b174066..6453d202 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1040,9 +1040,9 @@ install-data-hook: fi @if [ "x$(zshcompletiondir)" != "x" ]; then \ cd 
$(DESTDIR)$(zshcompletiondir) && \ - sed -e 's/^#compdef ug/#compdef ug+/' _ug > _ug+ && \ - sed -e 's/^#compdef ug/#compdef ugrep/' _ug > _ugrep && \ - sed -e 's/^#compdef ug/#compdef ugrep+/' _ug > _ugrep+; \ + $(LN_S) -f _ug _ug+ && \ + $(LN_S) -f _ug _ugrep && \ + $(LN_S) -f _ug _ugrep+; \ fi @echo " ______________________________________________________ "; \ echo "| |"; \ diff --git a/README.md b/README.md index 710e5c24..37c236f0 100644 --- a/README.md +++ b/README.md @@ -403,31 +403,22 @@ You can always add these later, when you need these features: the [zstd](http://facebook.github.io/zstd) library (optional, not required), e.g. with `sudo apt-get install -y libzstd-dev`. To search `.br` files, install the [brotli](https://github.com/google/brotli) library (optional, not - required), e.g. with `sudo apt-get install -y brotli`. To search `.bz3` - files, install the [bzip3](https://github.com/kspalaiologos/bzip3) library - (optional, not required), e.g. with `sudo apt-get install -y bzip3`. + required), e.g. with `sudo apt-get install -y libbrotli-dev`. To search + `.bz3` files, install the [bzip3](https://github.com/kspalaiologos/bzip3) + library (optional, not required), e.g. with `sudo apt-get install -y bzip3`. **Note:** even if your system has command line utilities, such as `bzip2`, that does not necessarily mean that the development libraries such as `libbz2` are installed. The *development libraries* should be installed. -After installing one or more of these libraries, re-execute the commands to -rebuild `ugrep`: - - $ cd ugrep - $ ./build.sh - **Note:** some Linux systems may not be configured to load dynamic libraries from `/usr/local/lib`, causing a library load error when running `ugrep`. To correct this, add `export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib"` to your `~/.bashrc` file. Or run `sudo ldconfig /usr/local/lib`. -**Note:** you can build static executables by supplying `--enable-static` -as an argument to `./build.sh`. 
- ### Other platforms: step 3 build -Build `ugrep` on Unix-like systems with colors enabled by default: +Execute the `./build.sh` script to build `ugrep`: $ cd ugrep $ ./build.sh @@ -441,24 +432,36 @@ Note that `ug` is the same as `ugrep` but also loads the configuration file .ugrep when present in the working directory or home directory. This means that you can define your default options for `ug` in .ugrep. -To build `ugrep` with specific hard defaults enabled, such as a pager: +Alternative paths to installed or local libraries may be specified with +`./build.sh`. To get help on the available build options: + + $ ./build.sh --help + +You can build static executables by specifying: + + $ ./build.sh --enable-static + +**Note:** this may fail if libraries don't link statically, such as brotli. +In that case try `./build.sh --enable-static --without-brotli`. + +You can build `ugrep` with customized defaults enabled, such as a pager: - $ cd ugrep $ ./build.sh --enable-pager Options to select defaults for builds include: +- `--help` display build options +- `--enable-static` build static executables, if possible - `--enable-hidden` always search hidden files and directories - `--enable-pager` always use a pager to display output on terminals - `--enable-pretty` colorize output to terminals and add filename headings - `--disable-auto-color` disable automatic colors, requires ugrep option `--color=auto` to show colors - `--disable-mmap` disable memory mapped files - `--disable-sse2` disable SSE2 and AVX optimizations -- `--disable-avx` disable AVX optimizations, but compile with SSE2 when supported +- `--disable-avx2` disable AVX2 and AVX512BW optimizations, but compile with SSE2 when supported - `--disable-neon` disable ARM NEON/AArch64 optimizations - `--with-grep-path` the default `-f` path if `GREP_PATH` is not defined - `--with-grep-colors` the default colors if `GREP_COLORS` is not defined -- `--help` display build options After the build completes, copy 
`ugrep/bin/ugrep` and `ugrep/bin/ug` to a convenient location, for example in your `~/bin` directory. Or, if you may want @@ -1189,7 +1192,7 @@ the top of the file, followed by the long options to override the defaults. Shift-Tab to navigate directories and to select a file to search. Press Enter to select lines to output. Press ALT-l for option -l to list files, ALT-n for -n, etc. Non-option commands include - ALT-] to increase context. See also options --confirm, --delay, + ALT-] to increase context. See also options --no-confirm, --delay, --split and --view. --no-confirm Do not confirm actions in -Q query TUI. The default is confirm. @@ -1643,18 +1646,11 @@ same line, like XOR: -w, --word-regexp The PATTERN is searched for as a word, such that the matching text is preceded by a non-word character and is followed by a non-word - character. Word characters are letters, digits, and the - underscore. With option -P, word characters are Unicode letters, - digits, and underscore. This option has no effect if -x is also - specified. If a PATTERN is specified, or -e PATTERN or -N PATTERN, - then this option has no effect on -f FILE patterns to allow -f FILE - patterns to narrow or widen the scope of the PATTERN search. + character. Word-like characters are Unicode letters, digits and + connector punctuations such as underscore. -x, --line-regexp Select only those matches that exactly match the whole line, as if - the patterns are surrounded by ^ and $. If a PATTERN is specified, - or -e PATTERN or -N PATTERN, then this option does not apply to - -f FILE patterns to allow -f FILE patterns to narrow or widen the - scope of the PATTERN search. + the patterns are surrounded by ^ and $. See also [Boolean query patterns with -%, -%%, --and, --not](#bool) for more powerful Boolean query options than the traditional GNU/BSD grep options. @@ -3944,9 +3940,9 @@ in markdown: Option -f FILE matches patterns specified in FILE. - By default Unicode patterns are matched. 
Option -U (--binary) disables - Unicode matching for ASCII and binary pattern matching. Non-Unicode - matching is generally more efficient. + By default Unicode patterns are matched. Option -U (--ascii or --binary) + disables Unicode matching for ASCII and binary pattern matching. Non- + Unicode matching is more efficient. ugrep accepts input of various encoding formats and normalizes the output to UTF-8. When a UTF byte order mark is present in the input, the input @@ -4149,8 +4145,8 @@ in markdown: --no-config Do not automatically load the default .ugrep configuration file. - --confirm - Confirm actions in -Q query TUI. The default is confirm. + --no-confirm + Do not confirm actions in -Q query TUI. The default is confirm. --cpp Output file matches in C++. See also options --format and -u. @@ -4607,8 +4603,8 @@ in markdown: select a file to search. Press Enter to select lines to output. Press ALT-l for option -l to list files, ALT-n for -n, etc. Non-option commands include ALT-] to increase context and ALT-} to - increase fuzzyness. See also options --confirm, --delay, --split - and --view. + increase fuzzyness. See also options --no-confirm, --delay, + --split and --view. -q, --quiet, --silent Quiet mode: suppress all output. Only search a file until a match @@ -4752,13 +4748,8 @@ in markdown: -w, --word-regexp The PATTERN is searched for as a word, such that the matching text is preceded by a non-word character and is followed by a non-word - character. Word characters are letters, digits and the - underscore. With option -P, word characters are Unicode letters, - digits and underscore. This option has no effect if -x is also - specified. If a PATTERN is specified, or -e PATTERN or -N - PATTERN, then this option has no effect on -f FILE patterns to - allow -f FILE patterns to narrow or widen the scope of the PATTERN - search. + character. Word-like characters are Unicode letters, digits and + connector punctuations such as underscore. 
--width[=NUM] Truncate the output to NUM visible characters per line. The width @@ -4773,10 +4764,7 @@ in markdown: -x, --line-regexp Select only those matches that exactly match the whole line, as if - the patterns are surrounded by ^ and $. If a PATTERN is - specified, or -e PATTERN or -N PATTERN, then this option has no - effect on -f FILE patterns to allow -f FILE patterns to narrow or - widen the scope of the PATTERN search. + the patterns are surrounded by ^ and $. --xml Output file matches in XML. If -H, -n, -k, or -b is specified, additional values are output. See also options --format and -u. @@ -5265,10 +5253,6 @@ in markdown: $ ugrep -o '\w+' myfile.txt - List all ASCII words in a file: - - $ ugrep -o '[[:word:]]+' myfile.txt - List the laughing face emojis (Unicode code points U+1F600 to U+1F60F): $ ugrep -o '[\x{1F600}-\x{1F60F}]' myfile.txt @@ -5410,7 +5394,7 @@ in markdown: - ugrep 6.0.0 May 6, 2024 UGREP(1) + ugrep 6.1.0 June 3, 2024 UGREP(1) 🔝 [Back to table of contents](#toc) diff --git a/bin/linux_amd64/README.md b/bin/linux_amd64/README.md new file mode 100644 index 00000000..56753d00 --- /dev/null +++ b/bin/linux_amd64/README.md @@ -0,0 +1,6 @@ +Linux x64 static executables +============================ + +The executables are statically built on Ubuntu for Linux x64. + +The executables detect AVX2 automatically for optimized searching. 
diff --git a/bin/linux_amd64/ugrep b/bin/linux_amd64/ugrep new file mode 100755 index 00000000..4d7c6d43 Binary files /dev/null and b/bin/linux_amd64/ugrep differ diff --git a/bin/linux_amd64/ugrep-indexer b/bin/linux_amd64/ugrep-indexer new file mode 100755 index 00000000..a3e8d876 Binary files /dev/null and b/bin/linux_amd64/ugrep-indexer differ diff --git a/bin/linux_arm64/README.md b/bin/linux_arm64/README.md new file mode 100644 index 00000000..a16598ff --- /dev/null +++ b/bin/linux_arm64/README.md @@ -0,0 +1,6 @@ +Linux AArch64 static executables +================================ + +The executables are statically built on Ubuntu for Linux AArch64. + +The executables will not run on ARM supporting NEON only. diff --git a/bin/linux_arm64/ugrep b/bin/linux_arm64/ugrep new file mode 100755 index 00000000..abb8b71e Binary files /dev/null and b/bin/linux_arm64/ugrep differ diff --git a/bin/linux_arm64/ugrep-indexer b/bin/linux_arm64/ugrep-indexer new file mode 100755 index 00000000..01cb7de6 Binary files /dev/null and b/bin/linux_arm64/ugrep-indexer differ diff --git a/bin/win32/README.md b/bin/win32/README.md index d93fc2ea..a1963c5b 100644 --- a/bin/win32/README.md +++ b/bin/win32/README.md @@ -7,4 +7,6 @@ Windows 32 bit executables If you want ugrep to emulate GNU grep, then copy `ugrep.exe` to `grep.exe` and to `egrep.exe`, `fgrep.exe`. -The executables detect SSE2 and AVX2 automatically for optimized searching. +The executables expect and require CPUs supporting SSE2. + +The executables detect AVX2/AVX512BW automatically for optimized searching. 
diff --git a/bin/win32/ug.exe b/bin/win32/ug.exe index 55285fb6..2e524ac4 100755 Binary files a/bin/win32/ug.exe and b/bin/win32/ug.exe differ diff --git a/bin/win32/ugrep-indexer.exe b/bin/win32/ugrep-indexer.exe index b06fedc4..e8c144e0 100755 Binary files a/bin/win32/ugrep-indexer.exe and b/bin/win32/ugrep-indexer.exe differ diff --git a/bin/win32/ugrep.exe b/bin/win32/ugrep.exe index 55285fb6..2e524ac4 100755 Binary files a/bin/win32/ugrep.exe and b/bin/win32/ugrep.exe differ diff --git a/bin/win64/README.md b/bin/win64/README.md index febb8365..a1415ded 100644 --- a/bin/win64/README.md +++ b/bin/win64/README.md @@ -7,4 +7,4 @@ Windows 64 bit executables If you want ugrep to emulate GNU grep, then copy `ugrep.exe` to `grep.exe` and to `egrep.exe`, `fgrep.exe`. -The executables detect SSE2 and AVX2 automatically for optimized searching. +The executables detect AVX2/AVX512BW automatically for optimized searching. diff --git a/bin/win64/ug.exe b/bin/win64/ug.exe index 155fecba..43dbf679 100755 Binary files a/bin/win64/ug.exe and b/bin/win64/ug.exe differ diff --git a/bin/win64/ugrep-indexer.exe b/bin/win64/ugrep-indexer.exe index 8c2b16ee..b398f8c3 100755 Binary files a/bin/win64/ugrep-indexer.exe and b/bin/win64/ugrep-indexer.exe differ diff --git a/bin/win64/ugrep.exe b/bin/win64/ugrep.exe index 155fecba..43dbf679 100755 Binary files a/bin/win64/ugrep.exe and b/bin/win64/ugrep.exe differ diff --git a/configure b/configure index b88e4c79..2eb983c0 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.72 for ugrep 6.0.0. +# Generated by GNU Autoconf 2.72 for ugrep 6.1.0. # # Report bugs to . # @@ -606,8 +606,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='ugrep' PACKAGE_TARNAME='ugrep' -PACKAGE_VERSION='6.0.0' -PACKAGE_STRING='ugrep 6.0.0' +PACKAGE_VERSION='6.1.0' +PACKAGE_STRING='ugrep 6.1.0' PACKAGE_BUGREPORT='https://github.com/Genivia/ugrep/issues' PACKAGE_URL='https://ugrep.com' @@ -805,6 +805,7 @@ enable_7zip with_bash_completion_dir with_fish_completion_dir with_zsh_completion_dir +enable_static with_grep_path with_grep_colors enable_auto_color @@ -1381,7 +1382,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -'configure' configures ugrep 6.0.0 to adapt to many kinds of systems. +'configure' configures ugrep 6.1.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1452,7 +1453,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of ugrep 6.0.0:";; + short | recursive ) echo "Configuration of ugrep 6.1.0:";; esac cat <<\_ACEOF @@ -1471,6 +1472,7 @@ Optional Features: --disable-neon disable NEON CPU extensions --disable-7zip to disable 7zip and no longer search .7z files (7z requires more memory and takes long to decompress) + --enable-static build static ugrep binaries --disable-auto-color disable automatic colors, otherwise colors are enabled by default --enable-color deprecated, use --disable-auto-color @@ -1628,7 +1630,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -ugrep configure 6.0.0 +ugrep configure 6.1.0 generated by GNU Autoconf 2.72 Copyright (C) 2023 Free Software Foundation, Inc. @@ -2182,7 +2184,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by ugrep $as_me 6.0.0, which was +It was created by ugrep $as_me 6.1.0, which was generated by GNU Autoconf 2.72. 
Invocation command line was $ $0$ac_configure_args_raw @@ -3692,7 +3694,7 @@ fi # Define the identity of the package. PACKAGE='ugrep' - VERSION='6.0.0' + VERSION='6.1.0' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -11040,6 +11042,31 @@ else fi +################################################################################ +# Static or dynamic (default) linking +################################################################################ + +# Check whether --enable-static was given. +if test ${enable_static+y} +then : + enableval=$enable_static; enable_static=yes +else case e in #( + e) enable_static=no ;; +esac +fi + +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for --enable-static" >&5 +printf %s "checking for --enable-static... " >&6; } +if test "x$enable_static" = "xyes"; then + CFLAGS="$CFLAGS -static" + LDFLAGS="$LDFLAGS -static" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } +else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } +fi + ################################################################################ # Installation preferences ################################################################################ @@ -11763,7 +11790,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by ugrep $as_me 6.0.0, which was +This file was extended by ugrep $as_me 6.1.0, which was generated by GNU Autoconf 2.72. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -11836,7 +11863,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -ugrep config.status 6.0.0 +ugrep config.status 6.1.0 configured by $0, generated by GNU Autoconf 2.72, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 3bdcc32f..c710eae5 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ugrep],[6.0.0],[https://github.com/Genivia/ugrep/issues],[ugrep],[https://ugrep.com]) +AC_INIT([ugrep],[6.1.0],[https://github.com/Genivia/ugrep/issues],[ugrep],[https://ugrep.com]) AM_INIT_AUTOMAKE([foreign subdir-objects dist-xz no-dist-gzip]) AC_CONFIG_HEADERS([config.h]) AC_COPYRIGHT([Copyright (C) 2019-2024 Robert van Engelen, Genivia Inc.]) diff --git a/include/reflex/absmatcher.h b/include/reflex/absmatcher.h index 454b8461..bd8dbc71 100644 --- a/include/reflex/absmatcher.h +++ b/include/reflex/absmatcher.h @@ -154,11 +154,13 @@ class AbstractMatcher { A(false), N(false), W(false), + X(false), T(8) { } bool A; ///< accept any/all (?^X) negative patterns as Const::REDO accept index codes bool N; ///< nullable, find may return empty match (N/A to scan, split, matches) - bool W; ///< half-check for "whole words", check only left of \< and right of \> for non-word character + bool W; ///< reflex::Matcher matches whole words as if bound by \< and \> + bool X; ///< reflex::LineMatcher matches empty lines char T; ///< tab size, must be a power of 2, default is 8, for column count and indent \i, \j, and \k }; /// AbstractMatcher::Iterator class for scanning, searching, and splitting input character sequences. 
@@ -356,7 +358,8 @@ class AbstractMatcher { { opt_.A = false; // when true: accept any/all (?^X) negative patterns as Const::REDO accept index codes opt_.N = false; // when true: find may return empty match (N/A to scan, split, matches) - opt_.W = false; // when true: half-check for "whole words", check only left of \< and right of \> for non-word character + opt_.W = false; // when true: reflex::Matcher matches whole words as if bound by \< and \> + opt_.X = false; // when true: reflex::LineMatcher matches empty lines opt_.T = 8; // tab size 1, 2, 4, or 8 if (opt) { @@ -373,6 +376,9 @@ class AbstractMatcher { case 'W': opt_.W = true; break; + case 'X': + opt_.X = true; + break; case 'T': opt_.T = isdigit(*(s += (s[1] == '=') + 1)) ? static_cast(*s - '0') : 0; break; @@ -926,13 +932,13 @@ class AbstractMatcher { else if (got_ == '\n') got_ = Const::UNK; } - /// Returns true if this matcher matched text that begins a word. + /// Returns true if this matcher matched text that begins an ASCII word. inline bool at_bow() /// @returns true if this matcher matched text that begins a word { return !isword(got_) && isword(txt_ < buf_ + end_ ? static_cast(*txt_) : peek_more()); } - /// Returns true if this matcher matched text that ends a word. + /// Returns true if this matcher matched text that ends an ASCII word. inline bool at_eow() /// @returns true if this matcher matched text that ends a word { diff --git a/include/reflex/fuzzymatcher.h b/include/reflex/fuzzymatcher.h index 67cb8f49..b332edee 100644 --- a/include/reflex/fuzzymatcher.h +++ b/include/reflex/fuzzymatcher.h @@ -216,7 +216,7 @@ class FuzzyMatcher : public Matcher { bpt.sub = bpt.alt; } /// backtrack on a backtrack point to insert or substitute a pattern char, restoring current text char matched and errors. 
- const Pattern::Opcode *backtrack(BacktrackPoint& bpt, int& c1) + const Pattern::Opcode *backtrack(BacktrackPoint& bpt, int& ch) { // no more alternatives if (bpt.pc1 == NULL) @@ -263,16 +263,16 @@ class FuzzyMatcher : public Matcher { err_ = bpt.err; // restore pos in the input pos_ = (txt_ - buf_) + bpt.len; - // set c1 to previous char before pos, to eventually set c0 in match(method) + // set ch to previous char before pos if (pos_ > 0) - c1 = static_cast(buf_[pos_ - 1]); + ch = static_cast(buf_[pos_ - 1]); else - c1 = got_; + ch = got_; // substitute or insert a pattern char in the text? if (bpt.sub) { // try substituting a pattern char for a mismatching char in the text - DBGLOG("Substitute: jump to %u at pos %zu char %d (0x%x)", jump, pos_, c1, c1); + DBGLOG("Substitute: jump to %u at pos %zu char %d (0x%x)", jump, pos_, ch, ch); int c = get(); if (!bin_ && c != EOF) { @@ -297,7 +297,7 @@ class FuzzyMatcher : public Matcher { else if (del_) { // try inserting a pattern char in the text to match a missing char in the text - DBGLOG("Delete: jump to %u at pos %zu char %d (0x%x)", jump, pos_, c1, c1); + DBGLOG("Delete: jump to %u at pos %zu char %d (0x%x)", jump, pos_, ch, ch); bpt.sub = bpt.alt; ++bpt.pc1; } @@ -325,7 +325,7 @@ class FuzzyMatcher : public Matcher { col_ = 0; // count columns for indent matching #endif find: - int c1 = got_; + int ch = got_; bool bol = at_bol(); // at begin of line? 
#if !defined(WITH_NO_INDENT) redo: @@ -333,8 +333,12 @@ class FuzzyMatcher : public Matcher { lap_.resize(0); cap_ = 0; bool nul = method == Const::MATCH; - if (pat_->opc_ != NULL) + if (pat_->opc_ != NULL && (!opt_.W || at_wb())) { + // skip to next line and keep searching if matching on anchor ^ and not at begin of line + if (method == Const::FIND && pat_->bol_ && !bol) + if (skip('\n')) + goto scan; err_ = 0; uint8_t stack = 0; const Pattern::Opcode *pc = pat_->opc_; @@ -358,10 +362,14 @@ class FuzzyMatcher : public Matcher { switch (opcode >> 24) { case 0xFE: // TAKE - cap_ = Pattern::long_index_of(opcode); - cur_ = pos_; + int c; + if (!opt_.W || (c = peek(), at_we(c, pos_))) + { + cap_ = Pattern::long_index_of(opcode); + DBGLOG("Take: cap = %zu", cap_); + cur_ = pos_; + } ++pc; - DBGLOG("Take: cap = %zu", cap_); continue; case 0xFD: // REDO cap_ = Const::REDO; @@ -402,11 +410,10 @@ class FuzzyMatcher : public Matcher { } #endif } - if (c1 == EOF) + if (ch == EOF) break; - int c0 = c1; - c1 = get(); - DBGLOG("Get: c1 = %d (0x%x)", c1, c1); + ch = get(); + DBGLOG("Get: ch = %d (0x%x)", ch, ch); // to jump to longest sequence of matching metas jump = Pattern::Const::IMAX; while (true) @@ -419,18 +426,21 @@ class FuzzyMatcher : public Matcher { switch (opcode >> 24) { case 0xFE: // TAKE - cap_ = Pattern::long_index_of(opcode); - cur_ = pos_; - if (c1 != EOF) - --cur_; // must unget one char + if (!opt_.W || at_we(ch, pos_ - 1)) + { + cap_ = Pattern::long_index_of(opcode); + DBGLOG("Take: cap = %zu", cap_); + cur_ = pos_; + if (ch != EOF) + --cur_; // must unget one char + } opcode = *++pc; - DBGLOG("Take: cap = %zu", cap_); continue; case 0xFD: // REDO cap_ = Const::REDO; DBGLOG("Redo"); cur_ = pos_; - if (c1 != EOF) + if (ch != EOF) --cur_; // must unget one char opcode = *++pc; continue; @@ -448,7 +458,7 @@ class FuzzyMatcher : public Matcher { continue; #if !defined(WITH_NO_INDENT) case Pattern::META_DED - Pattern::META_MIN: - DBGLOG("DED? 
%d", c1); + DBGLOG("DED? %d", ch); if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && dedent()) { jump = Pattern::index_of(opcode); @@ -458,7 +468,7 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_IND - Pattern::META_MIN: - DBGLOG("IND? %d", c1); + DBGLOG("IND? %d", ch); if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && indent()) { jump = Pattern::index_of(opcode); @@ -481,8 +491,8 @@ class FuzzyMatcher : public Matcher { continue; #endif case Pattern::META_EOB - Pattern::META_MIN: - DBGLOG("EOB? %d", c1); - if (jump == Pattern::Const::IMAX && c1 == EOF) + DBGLOG("EOB? %d", ch); + if (jump == Pattern::Const::IMAX && ch == EOF) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -501,9 +511,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_EOL - Pattern::META_MIN: - DBGLOG("EOL? %d", c1); + DBGLOG("EOL? %d", ch); anc_ = true; - if (jump == Pattern::Const::IMAX && (c1 == EOF || c1 == '\n' || (c1 == '\r' && peek() == '\n'))) + if (jump == Pattern::Const::IMAX && (ch == EOF || ch == '\n' || (ch == '\r' && peek() == '\n'))) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -523,9 +533,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_EWE - Pattern::META_MIN: - DBGLOG("EWE? %d %d %d", c0, c1, isword(c0) && !isword(c1)); + DBGLOG("EWE? %d", at_ewe(ch)); anc_ = true; - if (jump == Pattern::Const::IMAX && (isword(c0) || opt_.W) && !isword(c1)) + if (jump == Pattern::Const::IMAX && at_ewe(ch)) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -534,9 +544,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_BWE - Pattern::META_MIN: - DBGLOG("BWE? %d %d %d", c0, c1, !isword(c0) && isword(c1)); + DBGLOG("BWE? 
%d", at_bwe(ch)); anc_ = true; - if (jump == Pattern::Const::IMAX && !isword(c0) && isword(c1)) + if (jump == Pattern::Const::IMAX && at_bwe(ch)) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -545,10 +555,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_EWB - Pattern::META_MIN: - DBGLOG("EWB? %d", at_eow()); + DBGLOG("EWB? %d", at_ewb()); anc_ = true; - if (jump == Pattern::Const::IMAX && isword(got_) && - !isword(static_cast(method == Const::SPLIT ? txt_[len_] : *txt_))) + if (jump == Pattern::Const::IMAX && at_ewb()) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -557,10 +566,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_BWB - Pattern::META_MIN: - DBGLOG("BWB? %d", at_bow()); + DBGLOG("BWB? %d", at_bwb()); anc_ = true; - if (jump == Pattern::Const::IMAX && !isword(got_) && - (opt_.W || isword(static_cast(method == Const::SPLIT ? txt_[len_] : *txt_)))) + if (jump == Pattern::Const::IMAX && at_bwb()) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -569,9 +577,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_NWE - Pattern::META_MIN: - DBGLOG("NWE? %d %d %d", c0, c1, isword(c0) == isword(c1)); + DBGLOG("NWE? %d", at_nwe(ch)); anc_ = true; - if (jump == Pattern::Const::IMAX && isword(c0) == isword(c1)) + if (jump == Pattern::Const::IMAX && at_nwe(ch)) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -580,10 +588,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_NWB - Pattern::META_MIN: - DBGLOG("NWB? %d %d", at_bow(), at_eow()); + DBGLOG("NWB? 
%d", at_nwb()); anc_ = true; - if (jump == Pattern::Const::IMAX && - isword(got_) == isword(static_cast(txt_[len_]))) + if (jump == Pattern::Const::IMAX && at_nwb()) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -592,9 +599,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_WBE - Pattern::META_MIN: - DBGLOG("WBE? %d %d %d", c0, c1, isword(c0) != isword(c1)); + DBGLOG("WBE? %d", at_wbe(ch)); anc_ = true; - if (jump == Pattern::Const::IMAX && isword(c0) != isword(c1)) + if (jump == Pattern::Const::IMAX && at_wbe(ch)) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -603,10 +610,9 @@ class FuzzyMatcher : public Matcher { opcode = *++pc; continue; case Pattern::META_WBB - Pattern::META_MIN: - DBGLOG("WBB? %d %d", at_bow(), at_eow()); + DBGLOG("WBB? %d", at_wbb()); anc_ = true; - if (jump == Pattern::Const::IMAX && - isword(got_) != isword(static_cast(txt_[len_]))) + if (jump == Pattern::Const::IMAX && at_wbb()) { jump = Pattern::index_of(opcode); if (jump == Pattern::Const::LONG) @@ -619,7 +625,7 @@ class FuzzyMatcher : public Matcher { continue; } } - else if (c1 != EOF && !Pattern::is_opcode_halt(opcode)) + else if (ch != EOF && !Pattern::is_opcode_halt(opcode)) { if (jump == Pattern::Const::IMAX) break; @@ -654,38 +660,38 @@ class FuzzyMatcher : public Matcher { opcode = *pc; jump = Pattern::Const::IMAX; } - if (c1 == EOF) + if (ch == EOF) break; } else { - if (c1 == EOF) + if (ch == EOF) break; - c1 = get(); + ch = get(); if (Pattern::is_opcode_halt(opcode)) { if (back != Pattern::Const::IMAX) { pos_ = (txt_ - buf_) + bpos; pc = pat_->opc_ + back; - DBGLOG("Backtrack: back = %u pos = %zu c1 = %d", back, pos_, c1); + DBGLOG("Backtrack: back = %u pos = %zu ch = %d", back, pos_, ch); back = Pattern::Const::IMAX; continue; } break; } - DBGLOG("Get: c1 = %d (0x%x) at pos %zu", c1, c1, pos_ - 1); - if (bin_ || (c1 & 0xC0) != 0x80 || c1 == EOF) + DBGLOG("Get: ch = %d (0x%x) at pos 
%zu", ch, ch, pos_ - 1); + if (bin_ || (ch & 0xC0) != 0x80 || ch == EOF) { // save backtrack point (DFA and relative position in the match) pc0 = pc; len0 = pos_ - (txt_ - buf_); } - if (c1 == EOF) + if (ch == EOF) break; } { - Pattern::Opcode lo = c1 << 24; + Pattern::Opcode lo = ch << 24; Pattern::Opcode hi = lo | 0x00FFFFFF; unrolled: if (hi < opcode || lo > (opcode << 8)) @@ -749,7 +755,7 @@ class FuzzyMatcher : public Matcher { { pc = pat_->opc_ + back; pos_ = (txt_ - buf_) + bpos; - DBGLOG("Backtrack: back = %u pos = %zu c1 = %d", back, pos_, c1); + DBGLOG("Backtrack: back = %u pos = %zu ch = %d", back, pos_, ch); back = Pattern::Const::IMAX; continue; } @@ -768,27 +774,27 @@ class FuzzyMatcher : public Matcher { // exit fuzzy loop if fuzzy match succeeds till end of input when insertions are allowed if (cap_ > 0) { - if (c1 != EOF && ins_) + if (ch != EOF && ins_) { // text insertions are allowed while (err_ < max_) { ++err_; - c1 = get(); + ch = get(); // reached the end? - if (c1 == EOF) + if (ch == EOF) break; // skip one (multibyte) char - if (!bin_ && c1 >= 0xC0) + if (!bin_ && ch >= 0xC0) { - int n = (c1 >= 0xE0) + (c1 >= 0xF0); + int n = (ch >= 0xE0) + (ch >= 0xF0); while (n-- >= 0) - if ((c1 = get()) == EOF) + if ((ch = get()) == EOF) break; } } } - if (c1 == EOF || ins_) + if (ch == EOF || ins_) { // reached the end? 
if (at_end()) @@ -808,7 +814,7 @@ class FuzzyMatcher : public Matcher { break; } // no match, use fuzzy matching with max error - if (c1 == '\0' || c1 == '\n' || c1 == EOF) + if (ch == '\0' || ch == '\n' || ch == EOF) { // do not try to fuzzy match NUL, LF, or EOF if (err_ < max_ && del_) @@ -817,7 +823,7 @@ class FuzzyMatcher : public Matcher { // set backtrack point to insert pattern char only, not substitute, if pc0 os a different point than the last if (stack == 0 || bpt_[stack - 1].pc0 != pc0) { - point(bpt_[stack++], pc0, len0, false, c1 == EOF); + point(bpt_[stack++], pc0, len0, false, ch == EOF); DBGLOG("Point[%u] at %zu pos %zu (\\0|\\nEOF)", stack - 1, pc0 - pat_->opc_, pos_ - 1); } } @@ -827,7 +833,7 @@ class FuzzyMatcher : public Matcher { pc = NULL; while (stack > 0 && pc == NULL) { - pc = backtrack(bpt_[stack - 1], c1); + pc = backtrack(bpt_[stack - 1], ch); if (pc == NULL) --stack; } @@ -855,22 +861,22 @@ class FuzzyMatcher : public Matcher { if (!bin_) { // try pattern char deletion (text insertion): skip one (multibyte) char then rerun opcode at pc0 - if (c1 >= 0xC0) + if (ch >= 0xC0) { - int n = (c1 >= 0xE0) + (c1 >= 0xF0); + int n = (ch >= 0xE0) + (ch >= 0xF0); while (n-- >= 0) - if ((c1 = get()) == EOF) + if ((ch = get()) == EOF) break; } else { while ((peek() & 0xC0) == 0x80) - if ((c1 = get()) == EOF) + if ((ch = get()) == EOF) break; } } pc = pc0; - DBGLOG("Insert: %d (0x%x) at pos %zu", c1, c1, pos_ - 1); + DBGLOG("Insert: %d (0x%x) at pos %zu", ch, ch, pos_ - 1); } } else @@ -879,7 +885,7 @@ class FuzzyMatcher : public Matcher { pc = NULL; while (stack > 0 && pc == NULL) { - pc = backtrack(bpt_[stack - 1], c1); + pc = backtrack(bpt_[stack - 1], ch); if (pc == NULL) --stack; } @@ -1125,7 +1131,7 @@ class FuzzyMatcher : public Matcher { // skip one char to keep searching set_current(++cur_); // allow FIND with "N" to match an empty line, with ^$ etc. 
- if (cap_ == 0 || !opt_.N || (!bol && (c1 == '\n' || (c1 == '\r' && peek() == '\n')))) + if (cap_ == 0 || !opt_.N || (!bol && (ch == '\n' || (ch == '\r' && peek() == '\n')))) goto scan; DBGLOG("Accept empty match"); } diff --git a/include/reflex/linematcher.h b/include/reflex/linematcher.h index 3ab7f635..cf222fe5 100644 --- a/include/reflex/linematcher.h +++ b/include/reflex/linematcher.h @@ -138,8 +138,8 @@ class LineMatcher : public AbstractMatcher { // option N also finds empty lines if (n == 0 && !opt_.N) goto find; - // option W only finds empty lines - if (n > 0 && opt_.W) + // option X only finds empty lines + if (n > 0 && opt_.X) goto find; break; case Const::SPLIT: diff --git a/include/reflex/matcher.h b/include/reflex/matcher.h index 1c254df7..e92766c3 100644 --- a/include/reflex/matcher.h +++ b/include/reflex/matcher.h @@ -275,9 +275,9 @@ class Matcher : public PatternMatcher { stk_.pop(); } /// FSM code INIT. - inline void FSM_INIT(int& c1) + inline void FSM_INIT(int& c) { - c1 = fsm_.c1; + c = fsm_.ch; } /// FSM code FIND. inline void FSM_FIND() @@ -295,23 +295,30 @@ class Matcher : public PatternMatcher { return get(); } /// FSM code HALT. - inline void FSM_HALT(int c1 = AbstractMatcher::Const::UNK) + inline void FSM_HALT(int c = AbstractMatcher::Const::UNK) { - fsm_.c1 = c1; + fsm_.ch = c; } /// FSM code TAKE. inline void FSM_TAKE(Pattern::Accept cap) { - cap_ = cap; - cur_ = pos_; + int ch = peek(); + if (!opt_.W || at_we(ch, pos_)) + { + cap_ = cap; + cur_ = pos_; + } } /// FSM code TAKE. - inline void FSM_TAKE(Pattern::Accept cap, int c1) + inline void FSM_TAKE(Pattern::Accept cap, int c) { - cap_ = cap; - cur_ = pos_; - if (c1 != EOF) - --cur_; + if (!opt_.W || at_we(c, pos_ - 1)) + { + cap_ = cap; + cur_ = pos_; + if (c != EOF) + --cur_; + } } /// FSM code REDO. inline void FSM_REDO() @@ -320,11 +327,11 @@ class Matcher : public PatternMatcher { cur_ = pos_; } /// FSM code REDO. 
- inline void FSM_REDO(int c1) + inline void FSM_REDO(int c) { cap_ = Const::REDO; cur_ = pos_; - if (c1 != EOF) + if (c != EOF) --cur_; } /// FSM code HEAD. @@ -381,9 +388,9 @@ class Matcher : public PatternMatcher { } #endif /// FSM code META EOB. - inline bool FSM_META_EOB(int c1) + inline bool FSM_META_EOB(int c) { - return c1 == EOF; + return c == EOF; } /// FSM code META BOB. inline bool FSM_META_BOB() @@ -391,10 +398,10 @@ class Matcher : public PatternMatcher { return at_bob(); } /// FSM code META EOL. - inline bool FSM_META_EOL(int c1) + inline bool FSM_META_EOL(int c) { anc_ = true; - return c1 == EOF || c1 == '\n' || (c1 == '\r' && peek() == '\n'); + return c == EOF || c == '\n' || (c == '\r' && peek() == '\n'); } /// FSM code META BOL. inline bool FSM_META_BOL() @@ -403,69 +410,933 @@ class Matcher : public PatternMatcher { return fsm_.bol; } /// FSM code META EWE. - inline bool FSM_META_EWE(int c0, int c1) + inline bool FSM_META_EWE(int c) { anc_ = true; - return (isword(c0) || opt_.W) && !isword(c1); + return at_ewe(c); } /// FSM code META BWE. - inline bool FSM_META_BWE(int c0, int c1) + inline bool FSM_META_BWE(int c) { anc_ = true; - return !isword(c0) && isword(c1); + return at_bwe(c); } /// FSM code META EWB. inline bool FSM_META_EWB() { anc_ = true; - return isword(got_) && !isword(static_cast(txt_[len_])); + return at_ewb(); } /// FSM code META BWB. inline bool FSM_META_BWB() { anc_ = true; - return !isword(got_) && (opt_.W || isword(static_cast(txt_[len_]))); + return at_bwb(); } /// FSM code META NWE. - inline bool FSM_META_NWE(int c0, int c1) + inline bool FSM_META_NWE(int c) { anc_ = true; - return isword(c0) == isword(c1); + return at_nwe(c); } /// FSM code META NWB. inline bool FSM_META_NWB() { anc_ = true; - return isword(got_) == isword(static_cast(txt_[len_])); + return at_nwb(); } /// FSM code META WBE. 
- inline bool FSM_META_WBE(int c0, int c1) + inline bool FSM_META_WBE(int c) { anc_ = true; - return isword(c0) != isword(c1); + return at_wbe(c); } /// FSM code META WBB. inline bool FSM_META_WBB() { anc_ = true; - return isword(got_) != isword(static_cast(txt_[len_])); + return at_wbb(); } protected: typedef std::vector Stops; ///< indent margin/tab stops /// FSM data for FSM code struct FSM { - FSM() : bol(), nul(), c1() { } + FSM() : bol(), nul(), ch() { } bool bol; bool nul; - int c1; + int ch; }; + /// Return true if Unicode word character. + static bool iswword(int c) ///< character to test + { + // table source: unicode/language_scripts.cpp Word[] array updated to Unicode 15.1 + static const int word[2*712] = { + 48, 57, + 65, 90, + 95, 95, + 97, 122, + 170, 170, + 181, 181, + 186, 186, + 192, 214, + 216, 246, + 248, 705, + 710, 721, + 736, 740, + 748, 748, + 750, 750, + 880, 884, + 886, 887, + 890, 893, + 895, 895, + 902, 902, + 904, 906, + 908, 908, + 910, 929, + 931, 1013, + 1015, 1153, + 1162, 1327, + 1329, 1366, + 1369, 1369, + 1376, 1416, + 1488, 1514, + 1519, 1522, + 1568, 1610, + 1632, 1641, + 1646, 1647, + 1649, 1747, + 1749, 1749, + 1765, 1766, + 1774, 1788, + 1791, 1791, + 1808, 1808, + 1810, 1839, + 1869, 1957, + 1969, 1969, + 1984, 2026, + 2036, 2037, + 2042, 2042, + 2048, 2069, + 2074, 2074, + 2084, 2084, + 2088, 2088, + 2112, 2136, + 2144, 2154, + 2160, 2183, + 2185, 2190, + 2208, 2249, + 2308, 2361, + 2365, 2365, + 2384, 2384, + 2392, 2401, + 2406, 2415, + 2417, 2432, + 2437, 2444, + 2447, 2448, + 2451, 2472, + 2474, 2480, + 2482, 2482, + 2486, 2489, + 2493, 2493, + 2510, 2510, + 2524, 2525, + 2527, 2529, + 2534, 2545, + 2556, 2556, + 2565, 2570, + 2575, 2576, + 2579, 2600, + 2602, 2608, + 2610, 2611, + 2613, 2614, + 2616, 2617, + 2649, 2652, + 2654, 2654, + 2662, 2671, + 2674, 2676, + 2693, 2701, + 2703, 2705, + 2707, 2728, + 2730, 2736, + 2738, 2739, + 2741, 2745, + 2749, 2749, + 2768, 2768, + 2784, 2785, + 2790, 2799, + 2809, 2809, + 
2821, 2828, + 2831, 2832, + 2835, 2856, + 2858, 2864, + 2866, 2867, + 2869, 2873, + 2877, 2877, + 2908, 2909, + 2911, 2913, + 2918, 2927, + 2929, 2929, + 2947, 2947, + 2949, 2954, + 2958, 2960, + 2962, 2965, + 2969, 2970, + 2972, 2972, + 2974, 2975, + 2979, 2980, + 2984, 2986, + 2990, 3001, + 3024, 3024, + 3046, 3055, + 3077, 3084, + 3086, 3088, + 3090, 3112, + 3114, 3129, + 3133, 3133, + 3160, 3162, + 3165, 3165, + 3168, 3169, + 3174, 3183, + 3200, 3200, + 3205, 3212, + 3214, 3216, + 3218, 3240, + 3242, 3251, + 3253, 3257, + 3261, 3261, + 3293, 3294, + 3296, 3297, + 3302, 3311, + 3313, 3314, + 3332, 3340, + 3342, 3344, + 3346, 3386, + 3389, 3389, + 3406, 3406, + 3412, 3414, + 3423, 3425, + 3430, 3439, + 3450, 3455, + 3461, 3478, + 3482, 3505, + 3507, 3515, + 3517, 3517, + 3520, 3526, + 3558, 3567, + 3585, 3632, + 3634, 3635, + 3648, 3654, + 3664, 3673, + 3713, 3714, + 3716, 3716, + 3718, 3722, + 3724, 3747, + 3749, 3749, + 3751, 3760, + 3762, 3763, + 3773, 3773, + 3776, 3780, + 3782, 3782, + 3792, 3801, + 3804, 3807, + 3840, 3840, + 3872, 3881, + 3904, 3911, + 3913, 3948, + 3976, 3980, + 4096, 4138, + 4159, 4169, + 4176, 4181, + 4186, 4189, + 4193, 4193, + 4197, 4198, + 4206, 4208, + 4213, 4225, + 4238, 4238, + 4240, 4249, + 4256, 4293, + 4295, 4295, + 4301, 4301, + 4304, 4346, + 4348, 4680, + 4682, 4685, + 4688, 4694, + 4696, 4696, + 4698, 4701, + 4704, 4744, + 4746, 4749, + 4752, 4784, + 4786, 4789, + 4792, 4798, + 4800, 4800, + 4802, 4805, + 4808, 4822, + 4824, 4880, + 4882, 4885, + 4888, 4954, + 4992, 5007, + 5024, 5109, + 5112, 5117, + 5121, 5740, + 5743, 5759, + 5761, 5786, + 5792, 5866, + 5873, 5880, + 5888, 5905, + 5919, 5937, + 5952, 5969, + 5984, 5996, + 5998, 6000, + 6016, 6067, + 6103, 6103, + 6108, 6108, + 6112, 6121, + 6160, 6169, + 6176, 6264, + 6272, 6276, + 6279, 6312, + 6314, 6314, + 6320, 6389, + 6400, 6430, + 6470, 6509, + 6512, 6516, + 6528, 6571, + 6576, 6601, + 6608, 6617, + 6656, 6678, + 6688, 6740, + 6784, 6793, + 6800, 6809, + 6823, 6823, 
+ 6917, 6963, + 6981, 6988, + 6992, 7001, + 7043, 7072, + 7086, 7141, + 7168, 7203, + 7232, 7241, + 7245, 7293, + 7296, 7304, + 7312, 7354, + 7357, 7359, + 7401, 7404, + 7406, 7411, + 7413, 7414, + 7418, 7418, + 7424, 7615, + 7680, 7957, + 7960, 7965, + 7968, 8005, + 8008, 8013, + 8016, 8023, + 8025, 8025, + 8027, 8027, + 8029, 8029, + 8031, 8061, + 8064, 8116, + 8118, 8124, + 8126, 8126, + 8130, 8132, + 8134, 8140, + 8144, 8147, + 8150, 8155, + 8160, 8172, + 8178, 8180, + 8182, 8188, + 8255, 8256, + 8276, 8276, + 8305, 8305, + 8319, 8319, + 8336, 8348, + 8450, 8450, + 8455, 8455, + 8458, 8467, + 8469, 8469, + 8473, 8477, + 8484, 8484, + 8486, 8486, + 8488, 8488, + 8490, 8493, + 8495, 8505, + 8508, 8511, + 8517, 8521, + 8526, 8526, + 8579, 8580, + 11264, 11492, + 11499, 11502, + 11506, 11507, + 11520, 11557, + 11559, 11559, + 11565, 11565, + 11568, 11623, + 11631, 11631, + 11648, 11670, + 11680, 11686, + 11688, 11694, + 11696, 11702, + 11704, 11710, + 11712, 11718, + 11720, 11726, + 11728, 11734, + 11736, 11742, + 11823, 11823, + 12293, 12294, + 12337, 12341, + 12347, 12348, + 12353, 12438, + 12445, 12447, + 12449, 12538, + 12540, 12543, + 12549, 12591, + 12593, 12686, + 12704, 12735, + 12784, 12799, + 13312, 19903, + 19968, 42124, + 42192, 42237, + 42240, 42508, + 42512, 42539, + 42560, 42606, + 42623, 42653, + 42656, 42725, + 42775, 42783, + 42786, 42888, + 42891, 42954, + 42960, 42961, + 42963, 42963, + 42965, 42969, + 42994, 43009, + 43011, 43013, + 43015, 43018, + 43020, 43042, + 43072, 43123, + 43138, 43187, + 43216, 43225, + 43250, 43255, + 43259, 43259, + 43261, 43262, + 43264, 43301, + 43312, 43334, + 43360, 43388, + 43396, 43442, + 43471, 43481, + 43488, 43492, + 43494, 43518, + 43520, 43560, + 43584, 43586, + 43588, 43595, + 43600, 43609, + 43616, 43638, + 43642, 43642, + 43646, 43695, + 43697, 43697, + 43701, 43702, + 43705, 43709, + 43712, 43712, + 43714, 43714, + 43739, 43741, + 43744, 43754, + 43762, 43764, + 43777, 43782, + 43785, 43790, + 43793, 
43798, + 43808, 43814, + 43816, 43822, + 43824, 43866, + 43868, 43881, + 43888, 44002, + 44016, 44025, + 44032, 55203, + 55216, 55238, + 55243, 55291, + 63744, 64109, + 64112, 64217, + 64256, 64262, + 64275, 64279, + 64285, 64285, + 64287, 64296, + 64298, 64310, + 64312, 64316, + 64318, 64318, + 64320, 64321, + 64323, 64324, + 64326, 64433, + 64467, 64829, + 64848, 64911, + 64914, 64967, + 65008, 65019, + 65075, 65076, + 65101, 65103, + 65136, 65140, + 65142, 65276, + 65296, 65305, + 65313, 65338, + 65343, 65343, + 65345, 65370, + 65382, 65470, + 65474, 65479, + 65482, 65487, + 65490, 65495, + 65498, 65500, + 65536, 65547, + 65549, 65574, + 65576, 65594, + 65596, 65597, + 65599, 65613, + 65616, 65629, + 65664, 65786, + 66176, 66204, + 66208, 66256, + 66304, 66335, + 66349, 66368, + 66370, 66377, + 66384, 66421, + 66432, 66461, + 66464, 66499, + 66504, 66511, + 66560, 66717, + 66720, 66729, + 66736, 66771, + 66776, 66811, + 66816, 66855, + 66864, 66915, + 66928, 66938, + 66940, 66954, + 66956, 66962, + 66964, 66965, + 66967, 66977, + 66979, 66993, + 66995, 67001, + 67003, 67004, + 67072, 67382, + 67392, 67413, + 67424, 67431, + 67456, 67461, + 67463, 67504, + 67506, 67514, + 67584, 67589, + 67592, 67592, + 67594, 67637, + 67639, 67640, + 67644, 67644, + 67647, 67669, + 67680, 67702, + 67712, 67742, + 67808, 67826, + 67828, 67829, + 67840, 67861, + 67872, 67897, + 67968, 68023, + 68030, 68031, + 68096, 68096, + 68112, 68115, + 68117, 68119, + 68121, 68149, + 68192, 68220, + 68224, 68252, + 68288, 68295, + 68297, 68324, + 68352, 68405, + 68416, 68437, + 68448, 68466, + 68480, 68497, + 68608, 68680, + 68736, 68786, + 68800, 68850, + 68864, 68899, + 68912, 68921, + 69248, 69289, + 69296, 69297, + 69376, 69404, + 69415, 69415, + 69424, 69445, + 69488, 69505, + 69552, 69572, + 69600, 69622, + 69635, 69687, + 69734, 69743, + 69745, 69746, + 69749, 69749, + 69763, 69807, + 69840, 69864, + 69872, 69881, + 69891, 69926, + 69942, 69951, + 69956, 69956, + 69959, 69959, + 69968, 
70002, + 70006, 70006, + 70019, 70066, + 70081, 70084, + 70096, 70106, + 70108, 70108, + 70144, 70161, + 70163, 70187, + 70207, 70208, + 70272, 70278, + 70280, 70280, + 70282, 70285, + 70287, 70301, + 70303, 70312, + 70320, 70366, + 70384, 70393, + 70405, 70412, + 70415, 70416, + 70419, 70440, + 70442, 70448, + 70450, 70451, + 70453, 70457, + 70461, 70461, + 70480, 70480, + 70493, 70497, + 70656, 70708, + 70727, 70730, + 70736, 70745, + 70751, 70753, + 70784, 70831, + 70852, 70853, + 70855, 70855, + 70864, 70873, + 71040, 71086, + 71128, 71131, + 71168, 71215, + 71236, 71236, + 71248, 71257, + 71296, 71338, + 71352, 71352, + 71360, 71369, + 71424, 71450, + 71472, 71481, + 71488, 71494, + 71680, 71723, + 71840, 71913, + 71935, 71942, + 71945, 71945, + 71948, 71955, + 71957, 71958, + 71960, 71983, + 71999, 71999, + 72001, 72001, + 72016, 72025, + 72096, 72103, + 72106, 72144, + 72161, 72161, + 72163, 72163, + 72192, 72192, + 72203, 72242, + 72250, 72250, + 72272, 72272, + 72284, 72329, + 72349, 72349, + 72368, 72440, + 72704, 72712, + 72714, 72750, + 72768, 72768, + 72784, 72793, + 72818, 72847, + 72960, 72966, + 72968, 72969, + 72971, 73008, + 73030, 73030, + 73040, 73049, + 73056, 73061, + 73063, 73064, + 73066, 73097, + 73112, 73112, + 73120, 73129, + 73440, 73458, + 73474, 73474, + 73476, 73488, + 73490, 73523, + 73552, 73561, + 73648, 73648, + 73728, 74649, + 74880, 75075, + 77712, 77808, + 77824, 78895, + 78913, 78918, + 82944, 83526, + 92160, 92728, + 92736, 92766, + 92768, 92777, + 92784, 92862, + 92864, 92873, + 92880, 92909, + 92928, 92975, + 92992, 92995, + 93008, 93017, + 93027, 93047, + 93053, 93071, + 93760, 93823, + 93952, 94026, + 94032, 94032, + 94099, 94111, + 94176, 94177, + 94179, 94179, + 94208, 100343, + 100352, 101589, + 101632, 101640, + 110576, 110579, + 110581, 110587, + 110589, 110590, + 110592, 110882, + 110898, 110898, + 110928, 110930, + 110933, 110933, + 110948, 110951, + 110960, 111355, + 113664, 113770, + 113776, 113788, + 113792, 
113800, + 113808, 113817, + 119808, 119892, + 119894, 119964, + 119966, 119967, + 119970, 119970, + 119973, 119974, + 119977, 119980, + 119982, 119993, + 119995, 119995, + 119997, 120003, + 120005, 120069, + 120071, 120074, + 120077, 120084, + 120086, 120092, + 120094, 120121, + 120123, 120126, + 120128, 120132, + 120134, 120134, + 120138, 120144, + 120146, 120485, + 120488, 120512, + 120514, 120538, + 120540, 120570, + 120572, 120596, + 120598, 120628, + 120630, 120654, + 120656, 120686, + 120688, 120712, + 120714, 120744, + 120746, 120770, + 120772, 120779, + 120782, 120831, + 122624, 122654, + 122661, 122666, + 122928, 122989, + 123136, 123180, + 123191, 123197, + 123200, 123209, + 123214, 123214, + 123536, 123565, + 123584, 123627, + 123632, 123641, + 124112, 124139, + 124144, 124153, + 124896, 124902, + 124904, 124907, + 124909, 124910, + 124912, 124926, + 124928, 125124, + 125184, 125251, + 125259, 125259, + 125264, 125273, + 126464, 126467, + 126469, 126495, + 126497, 126498, + 126500, 126500, + 126503, 126503, + 126505, 126514, + 126516, 126519, + 126521, 126521, + 126523, 126523, + 126530, 126530, + 126535, 126535, + 126537, 126537, + 126539, 126539, + 126541, 126543, + 126545, 126546, + 126548, 126548, + 126551, 126551, + 126553, 126553, + 126555, 126555, + 126557, 126557, + 126559, 126559, + 126561, 126562, + 126564, 126564, + 126567, 126570, + 126572, 126578, + 126580, 126583, + 126585, 126588, + 126590, 126590, + 126592, 126601, + 126603, 126619, + 126625, 126627, + 126629, 126633, + 126635, 126651, + 130032, 130041, + 131072, 173791, + 173824, 177977, + 177984, 178205, + 178208, 183969, + 183984, 191456, + 191472, 192093, + 194560, 195101, + 196608, 201546, + 201552, 205743, + }; + static const uint16_t num = sizeof(word) / sizeof(int) / 2; + uint16_t min = 0; + uint16_t max = num - 1; + // binary search in table + if (c >= word[0] && c <= word[2 * num - 1]) + { + while (max >= min) + { + uint16_t mid = (min + max) / 2; + if (c < word[2 * mid]) + max 
= mid - 1; + else if (c > word[2 * mid + 1]) + min = mid + 1; + else + return true; + } + } + return false; + } + /// Check if a word begins before a match. + inline bool at_wb() + { +#if WITH_SPAN + int c = got_; + if (c == Const::BOB || c == Const::UNK || c == '\n') + return true; + if (c == '_') + return false; + if ((c & 0xc0) == 0x80 && cur_ > 0) + { + size_t k = cur_ - 1; + if (k > 0 && (buf_[--k] & 0xc0) == 0x80) + if (k > 0 && (buf_[--k] & 0xc0) == 0x80) + if (k > 0) + --k; + c = utf8(&buf_[k]); + return !iswword(c); + } + return !std::isalnum(static_cast(c)); +#else + return !isword(got_); +#endif + } + /// Check if a word ends after the match. + inline bool at_we( + int c, ///< character after the match + size_t k) ///< position in the buffer of the character after the match + { +#if WITH_SPAN + if (c == EOF) + return true; + if (c == '_') + return false; + if ((c & 0xc0) == 0xc0) + { + c = utf8(&buf_[k]); + return !iswword(c); + } + return !std::isalnum(static_cast(c)); +#else + (void)k; + return !isword(c); +#endif + } + /// Check if match begins a word (after split with len_ > 0 or len_ = 0 for find). + inline bool at_bw() + { +#if WITH_SPAN + int c = static_cast(txt_[len_]); + if (c == '_') + return true; + if ((c & 0xc0) == 0xc0) + { + c = utf8(&txt_[len_]); + return iswword(c); + } + return std::isalnum(static_cast(c)); +#else + return isword(static_cast(txt_[len_])) +#endif + } + /// Check if match ends a word. + inline bool at_ew(int c) + { + size_t k = pos_ + (c == EOF); + c = k > 1 ? 
static_cast(buf_[k - 2]) : got_; +#if WITH_SPAN + if (c == Const::BOB || c == Const::UNK || c == '\n') + return false; + if (c == '_') + return true; + if ((c & 0xc0) == 0x80 && k > 2) + { + k -= 3; + if ((buf_[k] & 0xc0) == 0x80) + if (k > 0 && (buf_[--k] & 0xc0) == 0x80) + if (k > 0) + --k; + c = utf8(&buf_[k]); + return iswword(c); + } + return std::isalnum(static_cast(c)); +#else + return isword(c); +#endif + } + /// Check end of word at match end boundary MATCH\>. + inline bool at_ewe(int c) ///< character last read with get() + { + return at_we(c, pos_) && at_ew(c); + } + /// Check begin of word at match end boundary MATCH\<. + inline bool at_bwe(int c) ///< character last read with get() + { + return !at_we(c, pos_) && !at_ew(c); + } + /// Check end of word at match begin boundary \>MATCH (after split with len_ > 0 or len_ = 0 for find). + inline bool at_ewb() + { + return !at_bw() && !at_wb(); + } + /// Check begin of word at match begin boundary \ 0 or len_ = 0 for find). + inline bool at_bwb() + { + return at_bw() && at_wb(); + } + /// Check not a word boundary at match end MATCH\B. + inline bool at_nwe(int c) ///< character last read with get() + { + return at_we(c, pos_) != at_ew(c); + } + /// Check not a word boundary at match begin \BMATCH (after split with len_ > 0 or len_ = 0 for find). + inline bool at_nwb() + { + return at_bw() != at_wb(); + } + /// Check word boundary at match end MATCH\b. + inline bool at_wbe(int c) ///< character last read with get() + { + return at_we(c, pos_) == at_ew(c); + } + /// Check word boundary at match begin \bMATCH (after split with len_ > 0 or len_ = 0 for find). + inline bool at_wbb() + { + return at_bw() == at_wb(); + } /// Returns true if input matched the pattern using method Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH. 
virtual size_t match(Method method) ///< Const::SCAN, Const::FIND, Const::SPLIT, or Const::MATCH /// @returns nonzero if input matched the pattern ; - // match() with optimized AVX512BW string search scheme defined in matcher_avx512bw.cpp + /// match() with optimized AVX512BW string search scheme defined in matcher_avx512bw.cpp size_t simd_match_avx512bw(Method method); - // match() with optimized AVX2 string search scheme defined in matcher_avx2.cpp + /// match() with optimized AVX2 string search scheme defined in matcher_avx2.cpp size_t simd_match_avx2(Method method); /// Initialize specialized (+ SSE2/NEON) pattern search methods to advance the engine to a possible match void init_advance(); diff --git a/include/reflex/pattern.h b/include/reflex/pattern.h index 5591032f..4d534885 100644 --- a/include/reflex/pattern.h +++ b/include/reflex/pattern.h @@ -941,8 +941,7 @@ class Pattern { void check_dfa_closure( const DFA::State *state, int nest, - bool& peek, - bool& prev) const; + bool& peek) const; void gencode_dfa_closure( FILE *fd, const DFA::State *start, @@ -1192,6 +1191,7 @@ class Pattern { float ams_; ///< ms elapsed time to analyze DFA for predict match and HFA size_t npy_; ///< entropy derived from the bitap array bit_[] bool one_; ///< true if matching one string stored in chr_[] without meta/anchors + bool bol_; ///< true if matching all patterns at the begin of a line with anchor ^ }; } // namespace reflex diff --git a/lib/convert.cpp b/lib/convert.cpp index 206236bc..f4589c78 100644 --- a/lib/convert.cpp +++ b/lib/convert.cpp @@ -778,7 +778,11 @@ static void insert_posix_class(const char *pattern, size_t len, size_t& pos, con else if (name[0] == 'A' && name[1] == 's') name = const_cast("ASCII"); } - const int *wc = Posix::range(name); + const int *wc = NULL; + if ((flags & convert_flag::unicode)) + wc = Unicode::range(name); + if (wc == NULL) + wc = Posix::range(name); if (wc == NULL) throw regex_error(regex_error::invalid_class, pattern, pos); if (*buf 
== '^') diff --git a/lib/language_scripts.cpp b/lib/language_scripts.cpp index 89f57e51..496a4a42 100644 --- a/lib/language_scripts.cpp +++ b/lib/language_scripts.cpp @@ -2107,6 +2107,36 @@ void reflex::Unicode::Tables::language_scripts(void) 0, 0 }; range["Grantha"] = Grantha; + static const int Graph[] = { + 33, 126, + 161, 172, + 174, 1535, + 1542, 1563, + 1565, 1756, + 1758, 1806, + 1808, 2191, + 2194, 2273, + 2275, 5759, + 5761, 6157, + 6159, 8191, + 8208, 8231, + 8240, 8286, + 8293, 8293, + 8304, 12287, + 12289, 55295, + 57344, 65278, + 65280, 65528, + 65532, 69820, + 69822, 69836, + 69838, 78895, + 78912, 113823, + 113828, 119154, + 119163, 917504, + 917506, 917535, + 917632, 1114111, + 0, 0 + }; + range["Graph"] = Graph; static const int Greek[] = { 880, 883, 885, 887, @@ -7430,6 +7460,34 @@ void reflex::Unicode::Tables::language_scripts(void) 0, 0 }; range["Po"] = Po; + static const int Print[] = { + 32, 126, + 160, 172, + 174, 1535, + 1542, 1563, + 1565, 1756, + 1758, 1806, + 1808, 2191, + 2194, 2273, + 2275, 6157, + 6159, 8202, + 8208, 8233, + 8239, 8287, + 8293, 8293, + 8304, 55295, + 57344, 65278, + 65280, 65528, + 65532, 69820, + 69822, 69836, + 69838, 78895, + 78912, 113823, + 113828, 119154, + 119163, 917504, + 917506, 917535, + 917632, 1114111, + 0, 0 + }; + range["Print"] = Print; static const int Ps[] = { 40, 40, 91, 91, diff --git a/lib/letter_scripts.cpp b/lib/letter_scripts.cpp index 84166696..37162511 100644 --- a/lib/letter_scripts.cpp +++ b/lib/letter_scripts.cpp @@ -2,6 +2,377 @@ #include void reflex::Unicode::Tables::letter_scripts(void) { + static const int Alnum[] = { + 48, 57, + 65, 90, + 97, 122, + 181, 181, + 192, 214, + 216, 246, + 248, 442, + 444, 447, + 452, 452, + 454, 455, + 457, 458, + 460, 497, + 499, 659, + 661, 687, + 880, 883, + 886, 887, + 891, 893, + 895, 895, + 902, 902, + 904, 906, + 908, 908, + 910, 929, + 931, 1013, + 1015, 1153, + 1162, 1327, + 1329, 1366, + 1376, 1416, + 1632, 1641, + 1776, 1785, + 1984, 1993, + 
2406, 2415, + 2534, 2543, + 2662, 2671, + 2790, 2799, + 2918, 2927, + 3046, 3055, + 3174, 3183, + 3302, 3311, + 3430, 3439, + 3558, 3567, + 3664, 3673, + 3792, 3801, + 3872, 3881, + 4160, 4169, + 4240, 4249, + 4256, 4293, + 4295, 4295, + 4301, 4301, + 4304, 4346, + 4349, 4351, + 5024, 5109, + 5112, 5117, + 6112, 6121, + 6160, 6169, + 6470, 6479, + 6608, 6617, + 6784, 6793, + 6800, 6809, + 6992, 7001, + 7088, 7097, + 7232, 7241, + 7248, 7257, + 7296, 7304, + 7312, 7354, + 7357, 7359, + 7424, 7467, + 7531, 7543, + 7545, 7578, + 7680, 7957, + 7960, 7965, + 7968, 8005, + 8008, 8013, + 8016, 8023, + 8025, 8025, + 8027, 8027, + 8029, 8029, + 8031, 8061, + 8064, 8071, + 8080, 8087, + 8096, 8103, + 8112, 8116, + 8118, 8123, + 8126, 8126, + 8130, 8132, + 8134, 8139, + 8144, 8147, + 8150, 8155, + 8160, 8172, + 8178, 8180, + 8182, 8187, + 8450, 8450, + 8455, 8455, + 8458, 8467, + 8469, 8469, + 8473, 8477, + 8484, 8484, + 8486, 8486, + 8488, 8488, + 8490, 8493, + 8495, 8500, + 8505, 8505, + 8508, 8511, + 8517, 8521, + 8526, 8526, + 8579, 8580, + 11264, 11387, + 11390, 11492, + 11499, 11502, + 11506, 11507, + 11520, 11557, + 11559, 11559, + 11565, 11565, + 42528, 42537, + 42560, 42605, + 42624, 42651, + 42786, 42863, + 42865, 42887, + 42891, 42894, + 42896, 42954, + 42960, 42961, + 42963, 42963, + 42965, 42969, + 42997, 42998, + 43002, 43002, + 43216, 43225, + 43264, 43273, + 43472, 43481, + 43504, 43513, + 43600, 43609, + 43824, 43866, + 43872, 43880, + 43888, 43967, + 44016, 44025, + 64256, 64262, + 64275, 64279, + 65296, 65305, + 65313, 65338, + 65345, 65370, + 66560, 66639, + 66720, 66729, + 66736, 66771, + 66776, 66811, + 66928, 66938, + 66940, 66954, + 66956, 66962, + 66964, 66965, + 66967, 66977, + 66979, 66993, + 66995, 67001, + 67003, 67004, + 68736, 68786, + 68800, 68850, + 68912, 68921, + 69734, 69743, + 69872, 69881, + 69942, 69951, + 70096, 70105, + 70384, 70393, + 70736, 70745, + 70864, 70873, + 71248, 71257, + 71360, 71369, + 71472, 71481, + 71840, 71913, + 
72016, 72025, + 72784, 72793, + 73040, 73049, + 73120, 73129, + 73552, 73561, + 92768, 92777, + 92864, 92873, + 93008, 93017, + 93760, 93823, + 119808, 119892, + 119894, 119964, + 119966, 119967, + 119970, 119970, + 119973, 119974, + 119977, 119980, + 119982, 119993, + 119995, 119995, + 119997, 120003, + 120005, 120069, + 120071, 120074, + 120077, 120084, + 120086, 120092, + 120094, 120121, + 120123, 120126, + 120128, 120132, + 120134, 120134, + 120138, 120144, + 120146, 120485, + 120488, 120512, + 120514, 120538, + 120540, 120570, + 120572, 120596, + 120598, 120628, + 120630, 120654, + 120656, 120686, + 120688, 120712, + 120714, 120744, + 120746, 120770, + 120772, 120779, + 120782, 120831, + 122624, 122633, + 122635, 122654, + 122661, 122666, + 123200, 123209, + 123632, 123641, + 124144, 124153, + 125184, 125251, + 125264, 125273, + 130032, 130041, + 0, 0 + }; + range["Alnum"] = Alnum; + static const int Alpha[] = { + 65, 90, + 97, 122, + 181, 181, + 192, 214, + 216, 246, + 248, 442, + 444, 447, + 452, 452, + 454, 455, + 457, 458, + 460, 497, + 499, 659, + 661, 687, + 880, 883, + 886, 887, + 891, 893, + 895, 895, + 902, 902, + 904, 906, + 908, 908, + 910, 929, + 931, 1013, + 1015, 1153, + 1162, 1327, + 1329, 1366, + 1376, 1416, + 4256, 4293, + 4295, 4295, + 4301, 4301, + 4304, 4346, + 4349, 4351, + 5024, 5109, + 5112, 5117, + 7296, 7304, + 7312, 7354, + 7357, 7359, + 7424, 7467, + 7531, 7543, + 7545, 7578, + 7680, 7957, + 7960, 7965, + 7968, 8005, + 8008, 8013, + 8016, 8023, + 8025, 8025, + 8027, 8027, + 8029, 8029, + 8031, 8061, + 8064, 8071, + 8080, 8087, + 8096, 8103, + 8112, 8116, + 8118, 8123, + 8126, 8126, + 8130, 8132, + 8134, 8139, + 8144, 8147, + 8150, 8155, + 8160, 8172, + 8178, 8180, + 8182, 8187, + 8450, 8450, + 8455, 8455, + 8458, 8467, + 8469, 8469, + 8473, 8477, + 8484, 8484, + 8486, 8486, + 8488, 8488, + 8490, 8493, + 8495, 8500, + 8505, 8505, + 8508, 8511, + 8517, 8521, + 8526, 8526, + 8579, 8580, + 11264, 11387, + 11390, 11492, + 11499, 11502, + 
11506, 11507, + 11520, 11557, + 11559, 11559, + 11565, 11565, + 42560, 42605, + 42624, 42651, + 42786, 42863, + 42865, 42887, + 42891, 42894, + 42896, 42954, + 42960, 42961, + 42963, 42963, + 42965, 42969, + 42997, 42998, + 43002, 43002, + 43824, 43866, + 43872, 43880, + 43888, 43967, + 64256, 64262, + 64275, 64279, + 65313, 65338, + 65345, 65370, + 66560, 66639, + 66736, 66771, + 66776, 66811, + 66928, 66938, + 66940, 66954, + 66956, 66962, + 66964, 66965, + 66967, 66977, + 66979, 66993, + 66995, 67001, + 67003, 67004, + 68736, 68786, + 68800, 68850, + 71840, 71903, + 93760, 93823, + 119808, 119892, + 119894, 119964, + 119966, 119967, + 119970, 119970, + 119973, 119974, + 119977, 119980, + 119982, 119993, + 119995, 119995, + 119997, 120003, + 120005, 120069, + 120071, 120074, + 120077, 120084, + 120086, 120092, + 120094, 120121, + 120123, 120126, + 120128, 120132, + 120134, 120134, + 120138, 120144, + 120146, 120485, + 120488, 120512, + 120514, 120538, + 120540, 120570, + 120572, 120596, + 120598, 120628, + 120630, 120654, + 120656, 120686, + 120688, 120712, + 120714, 120744, + 120746, 120770, + 120772, 120779, + 122624, 122633, + 122635, 122654, + 122661, 122666, + 125184, 125251, + 0, 0 + }; + range["Alpha"] = Alpha; static const int Ll[] = { 97, 122, 181, 181, diff --git a/lib/matcher.cpp b/lib/matcher.cpp index 1a35baf6..87358f25 100644 --- a/lib/matcher.cpp +++ b/lib/matcher.cpp @@ -54,11 +54,11 @@ size_t Matcher::match(Method method) col_ = 0; // count columns for indent matching #endif find: - int c1 = got_; + int ch = got_; bool bol = at_bol(); // at begin of line? 
#if !defined(WITH_NO_CODEGEN) if (pat_->fsm_ != NULL) - fsm_.c1 = c1; + fsm_.ch = ch; #endif #if !defined(WITH_NO_INDENT) redo: @@ -66,359 +66,367 @@ size_t Matcher::match(Method method) lap_.resize(0); cap_ = 0; bool nul = method == Const::MATCH; -#if !defined(WITH_NO_CODEGEN) - if (pat_->fsm_ != NULL) + if (!opt_.W || at_wb()) { - DBGLOG("FSM code %p", pat_->fsm_); - fsm_.bol = bol; - fsm_.nul = nul; - pat_->fsm_(*this); - nul = fsm_.nul; - c1 = fsm_.c1; - } - else + // skip to next line and keep searching if matching on anchor ^ and not at begin of line + if (method == Const::FIND && pat_->bol_ && !bol) + if (skip('\n')) + goto scan; +#if !defined(WITH_NO_CODEGEN) + if (pat_->fsm_ != NULL) + { + DBGLOG("FSM code %p", pat_->fsm_); + fsm_.bol = bol; + fsm_.nul = nul; + pat_->fsm_(*this); + nul = fsm_.nul; + ch = fsm_.ch; + } + else #endif - if (pat_->opc_ != NULL) - { - const Pattern::Opcode *pc = pat_->opc_; - Pattern::Index back = Pattern::Const::IMAX; // where to jump back to - size_t bpos = 0; // backtrack position in the input - while (true) - { - Pattern::Index jump; - Pattern::Opcode opcode = *pc; - DBGLOG("Fetch: code[%zu] = 0x%08X", pc - pat_->opc_, opcode); - if (!Pattern::is_opcode_goto(opcode)) + if (pat_->opc_ != NULL) + { + const Pattern::Opcode *pc = pat_->opc_; + Pattern::Index back = Pattern::Const::IMAX; // where to jump back to + size_t bpos = 0; // backtrack position in the input + while (true) { - switch (opcode >> 24) + Pattern::Index jump; + Pattern::Opcode opcode = *pc; + DBGLOG("Fetch: code[%zu] = 0x%08X", pc - pat_->opc_, opcode); + if (!Pattern::is_opcode_goto(opcode)) { - case 0xFE: // TAKE - cap_ = Pattern::long_index_of(opcode); - cur_ = pos_; - ++pc; - DBGLOG("Take: cap = %zu", cap_); - continue; - case 0xFD: // REDO - cap_ = Const::REDO; - DBGLOG("Redo"); - cur_ = pos_; - ++pc; - continue; - case 0xFC: // TAIL - { - Pattern::Lookahead la = Pattern::lookahead_of(opcode); - DBGLOG("Tail: %u", la); - if (lap_.size() > la && lap_[la] >= 
0) - cur_ = txt_ - buf_ + static_cast(lap_[la]); // mind the (new) gap + switch (opcode >> 24) + { + case 0xFE: // TAKE + { + int c; + if (!opt_.W || (c = peek(), at_we(c, pos_))) + { + cap_ = Pattern::long_index_of(opcode); + DBGLOG("Take: cap = %zu", cap_); + cur_ = pos_; + } + } ++pc; continue; - } - case 0xFB: // HEAD - { - Pattern::Lookahead la = Pattern::lookahead_of(opcode); - DBGLOG("Head: lookahead[%u] = %zu", la, pos_ - (txt_ - buf_)); - if (lap_.size() <= la) - lap_.resize(la + 1, -1); - lap_[la] = static_cast(pos_ - (txt_ - buf_)); // mind the gap + case 0xFD: // REDO + cap_ = Const::REDO; + DBGLOG("Redo"); + cur_ = pos_; ++pc; continue; - } + case 0xFC: // TAIL + { + Pattern::Lookahead la = Pattern::lookahead_of(opcode); + DBGLOG("Tail: %u", la); + if (lap_.size() > la && lap_[la] >= 0) + cur_ = txt_ - buf_ + static_cast(lap_[la]); // mind the (new) gap + ++pc; + continue; + } + case 0xFB: // HEAD + { + Pattern::Lookahead la = Pattern::lookahead_of(opcode); + DBGLOG("Head: lookahead[%u] = %zu", la, pos_ - (txt_ - buf_)); + if (lap_.size() <= la) + lap_.resize(la + 1, -1); + lap_[la] = static_cast(pos_ - (txt_ - buf_)); // mind the gap + ++pc; + continue; + } #if !defined(WITH_NO_INDENT) - case Pattern::META_DED - Pattern::META_MIN: - if (ded_ > 0) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(pc[1]); - DBGLOG("Dedent ded = %zu", ded_); // unconditional dedent matching \j - nul = true; - pc = pat_->opc_ + jump; - continue; - } + case Pattern::META_DED - Pattern::META_MIN: + if (ded_ > 0) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(pc[1]); + DBGLOG("Dedent ded = %zu", ded_); // unconditional dedent matching \j + nul = true; + pc = pat_->opc_ + jump; + continue; + } #endif - } - if (c1 == EOF) - break; - int c0 = c1; - c1 = get(); - DBGLOG("Get: c1 = %d", c1); - // to jump to longest sequence of matching metas - jump = 
Pattern::Const::IMAX; - while (true) - { - if (jump == Pattern::Const::IMAX || back == Pattern::Const::IMAX) + } + if (ch == EOF) + break; + ch = get(); + DBGLOG("Get: ch = %d", ch); + // to jump to longest sequence of matching metas + jump = Pattern::Const::IMAX; + while (true) { - if (!Pattern::is_opcode_goto(opcode)) + if (jump == Pattern::Const::IMAX || back == Pattern::Const::IMAX) { - // we no longer have to pass through all if jump and back are set - switch (opcode >> 24) + if (!Pattern::is_opcode_goto(opcode)) { - case 0xFE: // TAKE - cap_ = Pattern::long_index_of(opcode); - cur_ = pos_; - if (c1 != EOF) - --cur_; // must unget one char - opcode = *++pc; - DBGLOG("Take: cap = %zu", cap_); - continue; - case 0xFD: // REDO - cap_ = Const::REDO; - DBGLOG("Redo"); - cur_ = pos_; - if (c1 != EOF) - --cur_; // must unget one char - opcode = *++pc; - continue; - case 0xFC: // TAIL - { - Pattern::Lookahead la = Pattern::lookahead_of(opcode); - DBGLOG("Tail: %u", la); - if (lap_.size() > la && lap_[la] >= 0) - cur_ = txt_ - buf_ + static_cast(lap_[la]); // mind the (new) gap + // we no longer have to pass through all if jump and back are set + switch (opcode >> 24) + { + case 0xFE: // TAKE + if (!opt_.W || at_we(ch, pos_ - 1)) + { + cap_ = Pattern::long_index_of(opcode); + DBGLOG("Take: cap = %zu", cap_); + cur_ = pos_; + if (ch != EOF) + --cur_; // must unget one char + } + opcode = *++pc; + continue; + case 0xFD: // REDO + cap_ = Const::REDO; + DBGLOG("Redo"); + cur_ = pos_; + if (ch != EOF) + --cur_; // must unget one char + opcode = *++pc; + continue; + case 0xFC: // TAIL + { + Pattern::Lookahead la = Pattern::lookahead_of(opcode); + DBGLOG("Tail: %u", la); + if (lap_.size() > la && lap_[la] >= 0) + cur_ = txt_ - buf_ + static_cast(lap_[la]); // mind the (new) gap + opcode = *++pc; + continue; + } + case 0xFB: // HEAD opcode = *++pc; continue; - } - case 0xFB: // HEAD - opcode = *++pc; - continue; #if !defined(WITH_NO_INDENT) - case Pattern::META_DED - 
Pattern::META_MIN: - DBGLOG("DED? %d", c1); - if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && dedent()) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_IND - Pattern::META_MIN: - DBGLOG("IND? %d", c1); - if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && indent()) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_UND - Pattern::META_MIN: - DBGLOG("UND"); - if (mrk_) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - mrk_ = false; - ded_ = 0; - opcode = *++pc; - continue; + case Pattern::META_DED - Pattern::META_MIN: + DBGLOG("DED? %d", ch); + if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && dedent()) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_IND - Pattern::META_MIN: + DBGLOG("IND? %d", ch); + if (jump == Pattern::Const::IMAX && back == Pattern::Const::IMAX && bol && indent()) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_UND - Pattern::META_MIN: + DBGLOG("UND"); + if (mrk_) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + mrk_ = false; + ded_ = 0; + opcode = *++pc; + continue; #endif - case Pattern::META_EOB - Pattern::META_MIN: - DBGLOG("EOB? 
%d", c1); - if (jump == Pattern::Const::IMAX && c1 == EOF) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_BOB - Pattern::META_MIN: - DBGLOG("BOB? %d", at_bob()); - if (jump == Pattern::Const::IMAX && at_bob()) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_EOL - Pattern::META_MIN: - DBGLOG("EOL? %d", c1); - anc_ = true; - if (jump == Pattern::Const::IMAX && (c1 == EOF || c1 == '\n' || (c1 == '\r' && peek() == '\n'))) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_BOL - Pattern::META_MIN: - DBGLOG("BOL? %d", bol); - anc_ = true; - if (jump == Pattern::Const::IMAX && bol) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_EWE - Pattern::META_MIN: - DBGLOG("EWE? %d %d %d", c0, c1, isword(c0) && !isword(c1)); - anc_ = true; - if (jump == Pattern::Const::IMAX && (isword(c0) || opt_.W) && !isword(c1)) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_BWE - Pattern::META_MIN: - DBGLOG("BWE? %d %d %d", c0, c1, !isword(c0) && isword(c1)); - anc_ = true; - if (jump == Pattern::Const::IMAX && !isword(c0) && isword(c1)) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_EWB - Pattern::META_MIN: - DBGLOG("EWB? 
%d", at_eow()); - anc_ = true; - if (jump == Pattern::Const::IMAX && isword(got_) && - !isword(static_cast(txt_[len_]))) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_BWB - Pattern::META_MIN: - DBGLOG("BWB? %d", at_bow()); - anc_ = true; - if (jump == Pattern::Const::IMAX && !isword(got_) && - (opt_.W || isword(static_cast(txt_[len_])))) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_NWE - Pattern::META_MIN: - DBGLOG("NWE? %d %d %d", c0, c1, isword(c0) == isword(c1)); - anc_ = true; - if (jump == Pattern::Const::IMAX && isword(c0) == isword(c1)) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_NWB - Pattern::META_MIN: - DBGLOG("NWB? %d %d", at_bow(), at_eow()); - anc_ = true; - if (jump == Pattern::Const::IMAX && - isword(got_) == isword(static_cast(txt_[len_]))) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_WBE - Pattern::META_MIN: - DBGLOG("WBE? %d %d %d", c0, c1, isword(c0) != isword(c1)); - anc_ = true; - if (jump == Pattern::Const::IMAX && isword(c0) != isword(c1)) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case Pattern::META_WBB - Pattern::META_MIN: - DBGLOG("WBB? 
%d %d", at_bow(), at_eow()); - anc_ = true; - if (jump == Pattern::Const::IMAX && - isword(got_) != isword(static_cast(txt_[len_]))) - { - jump = Pattern::index_of(opcode); - if (jump == Pattern::Const::LONG) - jump = Pattern::long_index_of(*++pc); - } - opcode = *++pc; - continue; - case 0xFF: // LONG - opcode = *++pc; - continue; + case Pattern::META_EOB - Pattern::META_MIN: + DBGLOG("EOB? %d", ch); + if (jump == Pattern::Const::IMAX && ch == EOF) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_BOB - Pattern::META_MIN: + DBGLOG("BOB? %d", at_bob()); + if (jump == Pattern::Const::IMAX && at_bob()) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_EOL - Pattern::META_MIN: + DBGLOG("EOL? %d", ch); + anc_ = true; + if (jump == Pattern::Const::IMAX && + (ch == EOF || ch == '\n' || (ch == '\r' && peek() == '\n'))) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_BOL - Pattern::META_MIN: + DBGLOG("BOL? %d", bol); + anc_ = true; + if (jump == Pattern::Const::IMAX && bol) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_EWE - Pattern::META_MIN: + DBGLOG("EWE? %d", at_ewe(ch)); + anc_ = true; + if (jump == Pattern::Const::IMAX && at_ewe(ch)) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_BWE - Pattern::META_MIN: + DBGLOG("BWE? 
%d", at_bwe(ch)); + anc_ = true; + if (jump == Pattern::Const::IMAX && at_bwe(ch)) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_EWB - Pattern::META_MIN: + DBGLOG("EWB? %d", at_ewb()); + anc_ = true; + if (jump == Pattern::Const::IMAX && at_ewb()) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_BWB - Pattern::META_MIN: + DBGLOG("BWB? %d", at_bwb()); + anc_ = true; + if (jump == Pattern::Const::IMAX && at_bwb()) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_NWE - Pattern::META_MIN: + DBGLOG("NWE? %d", at_nwe(ch)); + anc_ = true; + if (jump == Pattern::Const::IMAX && at_nwe(ch)) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_NWB - Pattern::META_MIN: + DBGLOG("NWB? %d", at_nwb()); + anc_ = true; + if (jump == Pattern::Const::IMAX && at_nwb()) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_WBE - Pattern::META_MIN: + DBGLOG("WBE? %d", at_wbe(ch)); + anc_ = true; + if (jump == Pattern::Const::IMAX && at_wbe(ch)) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case Pattern::META_WBB - Pattern::META_MIN: + DBGLOG("WBB? 
%d", at_wbb()); + anc_ = true; + if (jump == Pattern::Const::IMAX && at_wbb()) + { + jump = Pattern::index_of(opcode); + if (jump == Pattern::Const::LONG) + jump = Pattern::long_index_of(*++pc); + } + opcode = *++pc; + continue; + case 0xFF: // LONG + opcode = *++pc; + continue; + } + } + else if (ch != EOF && !Pattern::is_opcode_halt(opcode)) + { + if (jump == Pattern::Const::IMAX) + break; + if (back == Pattern::Const::IMAX) + { + back = static_cast(pc - pat_->opc_); + bpos = pos_ - (txt_ - buf_) - 1; + DBGLOG("Backtrack point: back = %u pos = %zu", back, bpos); + } + pc = pat_->opc_ + jump; + opcode = *pc; } } - else if (c1 != EOF && !Pattern::is_opcode_halt(opcode)) + if (jump == Pattern::Const::IMAX) { - if (jump == Pattern::Const::IMAX) - break; - if (back == Pattern::Const::IMAX) + if (back != Pattern::Const::IMAX) { - back = static_cast(pc - pat_->opc_); - bpos = pos_ - (txt_ - buf_) - 1; - DBGLOG("Backtrack point: back = %u pos = %zu", back, bpos); + pc = pat_->opc_ + back; + opcode = *pc; + back = Pattern::Const::IMAX; } - pc = pat_->opc_ + jump; - opcode = *pc; + break; + } + DBGLOG("Try jump = %u", jump); + if (back == Pattern::Const::IMAX) + { + back = static_cast(pc - pat_->opc_); + bpos = pos_ - (txt_ - buf_) - 1; + DBGLOG("Backtrack point: back = %u pos = %zu", back, bpos); } + pc = pat_->opc_ + jump; + opcode = *pc; + jump = Pattern::Const::IMAX; } - if (jump == Pattern::Const::IMAX) + if (ch == EOF) + break; + } + else + { + if (Pattern::is_opcode_halt(opcode)) { if (back != Pattern::Const::IMAX) { + pos_ = (txt_ - buf_) + bpos; pc = pat_->opc_ + back; - opcode = *pc; + DBGLOG("Backtrack: back = %u pos = %zu ch = %d", back, pos_, ch); back = Pattern::Const::IMAX; + continue; } break; } - DBGLOG("Try jump = %u", jump); - if (back == Pattern::Const::IMAX) - { - back = static_cast(pc - pat_->opc_); - bpos = pos_ - (txt_ - buf_) - 1; - DBGLOG("Backtrack point: back = %u pos = %zu", back, bpos); - } - pc = pat_->opc_ + jump; - opcode = *pc; - jump = 
Pattern::Const::IMAX; - } - if (c1 == EOF) - break; - } - else - { - if (Pattern::is_opcode_halt(opcode)) - { - if (back != Pattern::Const::IMAX) - { - pos_ = (txt_ - buf_) + bpos; - pc = pat_->opc_ + back; - DBGLOG("Backtrack: back = %u pos = %zu c1 = %d", back, pos_, c1); - back = Pattern::Const::IMAX; - continue; - } - break; + if (ch == EOF) + break; + ch = get(); + DBGLOG("Get: ch = %d (0x%x) at pos %zu", ch, ch, pos_ - 1); + if (ch == EOF) + break; } - if (c1 == EOF) - break; - c1 = get(); - DBGLOG("Get: c1 = %d (0x%x) at pos %zu", c1, c1, pos_ - 1); - if (c1 == EOF) - break; - } - Pattern::Opcode lo = c1 << 24; - Pattern::Opcode hi = lo | 0x00FFFFFF; -unrolled: - if (hi < opcode || lo > (opcode << 8)) - { - opcode = *++pc; + Pattern::Opcode lo = ch << 24; + Pattern::Opcode hi = lo | 0x00FFFFFF; + unrolled: if (hi < opcode || lo > (opcode << 8)) { opcode = *++pc; @@ -440,7 +448,11 @@ size_t Matcher::match(Method method) if (hi < opcode || lo > (opcode << 8)) { opcode = *++pc; - goto unrolled; + if (hi < opcode || lo > (opcode << 8)) + { + opcode = *++pc; + goto unrolled; + } } } } @@ -448,49 +460,49 @@ size_t Matcher::match(Method method) } } } - } - jump = Pattern::index_of(opcode); - if (jump == 0) - { - // loop back to start state w/o full match: advance to avoid backtracking - if (cap_ == 0 && method == Const::FIND) + jump = Pattern::index_of(opcode); + if (jump == 0) { - if (cur_ + 1 == pos_) - { - // matched one char in a loop, do not backtrack here - ++cur_; - if (retry > 0) - --retry; - } - else + // loop back to start state w/o full match: advance to avoid backtracking + if (cap_ == 0 && method == Const::FIND) { - // check each char in buf_[cur_+1..pos_-1] if it is a starting char, if not then increase cur_ - while (cur_ + 1 < pos_ && !pat_->fst_.test(static_cast(buf_[cur_ + 1]))) + if (cur_ + 1 == pos_) { + // matched one char in a loop, do not backtrack here ++cur_; if (retry > 0) --retry; } + else + { + // check each char in buf_[cur_+1..pos_-1] 
if it is a starting char, if not then increase cur_ + while (cur_ + 1 < pos_ && !pat_->fst_.test(static_cast(buf_[cur_ + 1]))) + { + ++cur_; + if (retry > 0) + --retry; + } + } } } - } - else if (jump >= Pattern::Const::LONG) - { - if (jump == Pattern::Const::HALT) + else if (jump >= Pattern::Const::LONG) { - if (back != Pattern::Const::IMAX) + if (jump == Pattern::Const::HALT) { - pc = pat_->opc_ + back; - pos_ = (txt_ - buf_) + bpos; - DBGLOG("Backtrack: back = %u pos = %zu c1 = %d", back, pos_, c1); - back = Pattern::Const::IMAX; - continue; + if (back != Pattern::Const::IMAX) + { + pc = pat_->opc_ + back; + pos_ = (txt_ - buf_) + bpos; + DBGLOG("Backtrack: back = %u pos = %zu ch = %d", back, pos_, ch); + back = Pattern::Const::IMAX; + continue; + } + break; } - break; + jump = Pattern::long_index_of(pc[1]); } - jump = Pattern::long_index_of(pc[1]); + pc = pat_->opc_ + jump; } - pc = pat_->opc_ + jump; } } #if !defined(WITH_NO_INDENT) @@ -613,9 +625,13 @@ size_t Matcher::match(Method method) } if (!pat_->one_) goto scan; + size_t k = cur_ + pat_->len_; + ch = k < end_ ? static_cast(buf_[k]) : EOF; + if (opt_.W && (!at_wb() || !(at_end() || at_we(ch, k)))) + goto scan; txt_ = buf_ + cur_; len_ = pat_->len_; - set_current(cur_ + len_); + set_current(k); return cap_ = 1; } } diff --git a/lib/pattern.cpp b/lib/pattern.cpp index 3a6d2f8e..100e1f56 100644 --- a/lib/pattern.cpp +++ b/lib/pattern.cpp @@ -45,10 +45,10 @@ /// DFA compaction: -1 == reverse order edge compression (best); 1 == edge compression; 0 == no edge compression. /** Edge compression reorders edges to produce fewer tests when executed in the compacted order. 
For example ([a-cg-ik]|d|[e-g]|j|y|[x-z]) after reverse edge compression has only 2 edges: - c1 = m.FSM_CHAR(); - if ('x' <= c1 && c1 <= 'z') goto S3; - if ('a' <= c1 && c1 <= 'k') goto S3; - return m.FSM_HALT(c1); + c = m.FSM_CHAR(); + if ('x' <= c && c <= 'z') goto S3; + if ('a' <= c && c <= 'k') goto S3; + return m.FSM_HALT(c); */ #define WITH_COMPACT_DFA -1 @@ -177,6 +177,7 @@ void Pattern::init(const char *options, const uint8_t *pred) bmd_ = 0; npy_ = 0; one_ = false; + bol_ = false; vno_ = 0; eno_ = 0; hno_ = 0; @@ -197,6 +198,7 @@ void Pattern::init(const char *options, const uint8_t *pred) len_ = pred[0]; min_ = pred[1] & 0x0f; one_ = pred[1] & 0x10; + bol_ = pred[1] & 0x40; memcpy(chr_, pred + 2, len_); size_t n = 2 + len_; if (len_ == 0) @@ -658,6 +660,7 @@ void Pattern::parse( loc = 0; } } + bol_ = at(loc) == '^'; do { Location end = loc; @@ -762,6 +765,8 @@ void Pattern::parse( } else { + if (at(loc) != '^') + bol_ = false; parse2( true, loc, @@ -2990,8 +2995,8 @@ void Pattern::gencode_dfa(const DFA::State *start) const ::fprintf(file, "void reflex_code_%s(reflex::Matcher& m)\n" "{\n" - " int c0 = 0, c1 = 0;\n" - " m.FSM_INIT(c1);\n", opt_.n.empty() ? "FSM" : opt_.n.c_str()); + " int c = 0;\n" + " m.FSM_INIT(c);\n", opt_.n.empty() ? 
"FSM" : opt_.n.c_str()); for (const DFA::State *state = start; state != NULL; state = state->next) { ::fprintf(file, "\nS%u:\n", state->index); @@ -3007,8 +3012,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const ::fprintf(file, " m.FSM_HEAD(%u);\n", *i); if (state->edges.rbegin() != state->edges.rend() && state->edges.rbegin()->first == META_DED) ::fprintf(file, " if (m.FSM_DENT()) goto S%u;\n", state->edges.rbegin()->second.second->index); - bool peek = false; // if we need to read a character into c1 - bool prev = false; // if we need to keep the previous character in c0 + bool peek = false; // if we need to read a character into c for (DFA::State::Edges::const_reverse_iterator i = state->edges.rbegin(); i != state->edges.rend(); ++i) { #if WITH_COMPACT_DFA == -1 @@ -3022,13 +3026,12 @@ void Pattern::gencode_dfa(const DFA::State *start) const { do { - if (lo == META_EOB || lo == META_EOL) + if (lo == META_EOB || lo == META_EOL || lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE) + { peek = true; - else if (lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE) - prev = peek = true; - if (prev && peek) break; - check_dfa_closure(i->second.second, 1, peek, prev); + } + check_dfa_closure(i->second.second, 1, peek); } while (++lo <= hi); } else @@ -3054,10 +3057,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const target_index = i->second.second->index; if (read) { - if (prev) - ::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n"); - else - ::fprintf(file, " c1 = m.FSM_CHAR();\n"); + ::fprintf(file, " c = m.FSM_CHAR();\n"); read = false; } if (is_meta(lo)) @@ -3068,14 +3068,6 @@ void Pattern::gencode_dfa(const DFA::State *start) const { case META_EOB: case META_EOL: - ::fprintf(file, " "); - if (elif) - ::fprintf(file, "else "); - ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]); - gencode_dfa_closure(file, i->second.second, 2, peek); - ::fprintf(file, " }\n"); - elif = true; - break; case 
META_EWE: case META_BWE: case META_NWE: @@ -3083,7 +3075,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const ::fprintf(file, " "); if (elif) ::fprintf(file, "else "); - ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]); + ::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]); gencode_dfa_closure(file, i->second.second, 2, peek); ::fprintf(file, " }\n"); elif = true; @@ -3106,7 +3098,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const break; if (lo == hi) { - ::fprintf(file, " if (c1 == "); + ::fprintf(file, " if (c == "); print_char(file, lo); ::fprintf(file, ")"); } @@ -3114,20 +3106,20 @@ void Pattern::gencode_dfa(const DFA::State *start) const { ::fprintf(file, " if ("); print_char(file, lo); - ::fprintf(file, " <= c1)"); + ::fprintf(file, " <= c)"); } else { ::fprintf(file, " if ("); print_char(file, lo); - ::fprintf(file, " <= c1 && c1 <= "); + ::fprintf(file, " <= c && c <= "); print_char(file, hi); ::fprintf(file, ")"); } if (target_index == Const::IMAX) { if (peek) - ::fprintf(file, " return m.FSM_HALT(c1);\n"); + ::fprintf(file, " return m.FSM_HALT(c);\n"); else ::fprintf(file, " return m.FSM_HALT();\n"); } @@ -3146,10 +3138,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const { if (read) { - if (prev) - ::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n"); - else - ::fprintf(file, " c1 = m.FSM_CHAR();\n"); + ::fprintf(file, " c = m.FSM_CHAR();\n"); read = false; } do @@ -3158,14 +3147,6 @@ void Pattern::gencode_dfa(const DFA::State *start) const { case META_EOB: case META_EOL: - ::fprintf(file, " "); - if (elif) - ::fprintf(file, "else "); - ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]); - gencode_dfa_closure(file, i->second.second, 2, peek); - ::fprintf(file, " }\n"); - elif = true; - break; case META_EWE: case META_BWE: case META_NWE: @@ -3173,7 +3154,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const ::fprintf(file, " "); if (elif) 
::fprintf(file, "else "); - ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]); + ::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]); gencode_dfa_closure(file, i->second.second, 2, peek); ::fprintf(file, " }\n"); elif = true; @@ -3199,10 +3180,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const target_index = i->second.second->index; if (read) { - if (prev) - ::fprintf(file, " c0 = c1, c1 = m.FSM_CHAR();\n"); - else - ::fprintf(file, " c1 = m.FSM_CHAR();\n"); + ::fprintf(file, " c = m.FSM_CHAR();\n"); read = false; } if (!is_meta(lo)) @@ -3212,7 +3190,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const break; if (lo == hi) { - ::fprintf(file, " if (c1 == "); + ::fprintf(file, " if (c == "); print_char(file, lo); ::fprintf(file, ")"); } @@ -3220,20 +3198,20 @@ void Pattern::gencode_dfa(const DFA::State *start) const { ::fprintf(file, " if ("); print_char(file, lo); - ::fprintf(file, " <= c1)"); + ::fprintf(file, " <= c)"); } else { ::fprintf(file, " if ("); print_char(file, lo); - ::fprintf(file, " <= c1 && c1 <= "); + ::fprintf(file, " <= c && c <= "); print_char(file, hi); ::fprintf(file, ")"); } if (target_index == Const::IMAX) { if (peek) - ::fprintf(file, " return m.FSM_HALT(c1);\n"); + ::fprintf(file, " return m.FSM_HALT(c);\n"); else ::fprintf(file, " return m.FSM_HALT();\n"); } @@ -3245,7 +3223,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const } #endif if (peek) - ::fprintf(file, " return m.FSM_HALT(c1);\n"); + ::fprintf(file, " return m.FSM_HALT(c);\n"); else ::fprintf(file, " return m.FSM_HALT();\n"); } @@ -3263,7 +3241,7 @@ void Pattern::gencode_dfa(const DFA::State *start) const } #ifndef WITH_NO_CODEGEN -void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek, bool& prev) const +void Pattern::check_dfa_closure(const DFA::State *state, int nest, bool& peek) const { if (nest > 5) return; @@ -3280,13 +3258,12 @@ void Pattern::check_dfa_closure(const 
DFA::State *state, int nest, bool& peek, b { do { - if (lo == META_EOB || lo == META_EOL) + if (lo == META_EOB || lo == META_EOL || lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE) + { peek = true; - else if (lo == META_EWE || lo == META_BWE || lo == META_NWE || lo == META_WBE) - prev = peek = true; - if (prev && peek) break; - check_dfa_closure(i->second.second, nest + 1, peek, prev); + } + check_dfa_closure(i->second.second, nest + 1, peek); } while (++lo <= hi); } } @@ -3300,14 +3277,14 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, if (state->redo) { if (peek) - ::fprintf(file, "%*sm.FSM_REDO(c1);\n", 2*nest, ""); + ::fprintf(file, "%*sm.FSM_REDO(c);\n", 2*nest, ""); else ::fprintf(file, "%*sm.FSM_REDO();\n", 2*nest, ""); } else if (state->accept > 0) { if (peek) - ::fprintf(file, "%*sm.FSM_TAKE(%u, c1);\n", 2*nest, "", state->accept); + ::fprintf(file, "%*sm.FSM_TAKE(%u, c);\n", 2*nest, "", state->accept); else ::fprintf(file, "%*sm.FSM_TAKE(%u);\n", 2*nest, "", state->accept); } @@ -3332,14 +3309,6 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, { case META_EOB: case META_EOL: - ::fprintf(file, "%*s", 2*nest, ""); - if (elif) - ::fprintf(file, "else "); - ::fprintf(file, "if (m.FSM_META_%s(c1)) {\n", meta_label[lo - META_MIN]); - gencode_dfa_closure(file, i->second.second, nest + 1, peek); - ::fprintf(file, "%*s}\n", 2*nest, ""); - elif = true; - break; case META_EWE: case META_BWE: case META_NWE: @@ -3347,7 +3316,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, ::fprintf(file, "%*s", 2*nest, ""); if (elif) ::fprintf(file, "else "); - ::fprintf(file, "if (m.FSM_META_%s(c0, c1)) {\n", meta_label[lo - META_MIN]); + ::fprintf(file, "if (m.FSM_META_%s(c)) {\n", meta_label[lo - META_MIN]); gencode_dfa_closure(file, i->second.second, nest + 1, peek); ::fprintf(file, "%*s}\n", 2*nest, ""); elif = true; @@ -3375,7 +3344,7 @@ void 
Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, ::fprintf(file, "%*s", 2*nest, ""); if (lo == hi) { - ::fprintf(file, "if (c1 == "); + ::fprintf(file, "if (c == "); print_char(file, lo); ::fprintf(file, ")"); } @@ -3383,20 +3352,20 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, { ::fprintf(file, "if ("); print_char(file, lo); - ::fprintf(file, " <= c1)"); + ::fprintf(file, " <= c)"); } else { ::fprintf(file, "if ("); print_char(file, lo); - ::fprintf(file, " <= c1 && c1 <= "); + ::fprintf(file, " <= c && c <= "); print_char(file, hi); ::fprintf(file, ")"); } if (target_index == Const::IMAX) { if (peek) - ::fprintf(file, " return m.FSM_HALT(c1);\n"); + ::fprintf(file, " return m.FSM_HALT(c);\n"); else ::fprintf(file, " return m.FSM_HALT();\n"); } @@ -3423,7 +3392,7 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, ::fprintf(file, "%*s", 2*nest, ""); if (lo == hi) { - ::fprintf(file, "if (c1 == "); + ::fprintf(file, "if (c == "); print_char(file, lo); ::fprintf(file, ")"); } @@ -3431,20 +3400,20 @@ void Pattern::gencode_dfa_closure(FILE *file, const DFA::State *state, int nest, { ::fprintf(file, "if ("); print_char(file, lo); - ::fprintf(file, " <= c1)"); + ::fprintf(file, " <= c)"); } else { ::fprintf(file, "if ("); print_char(file, lo); - ::fprintf(file, " <= c1 && c1 <= "); + ::fprintf(file, " <= c && c <= "); print_char(file, hi); ::fprintf(file, ")"); } if (target_index == Const::IMAX) { if (peek) - ::fprintf(file, " return m.FSM_HALT(c1);\n"); + ::fprintf(file, " return m.FSM_HALT(c);\n"); else ::fprintf(file, " return m.FSM_HALT();\n"); } @@ -4589,7 +4558,7 @@ bool Pattern::match_hfa_transitions(size_t level, const HFA::Hashes& hashes, con void Pattern::write_predictor(FILE *file) const { ::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? 
"FSM" : opt_.n.c_str(), 2 + len_ + (len_ == 0) * 256 + Const::HASH + (lbk_ > 0) * 68); - ::fprintf(file, "\n %3hhu,%3hhu,", static_cast(len_), (static_cast(min_ | (one_ << 4) | ((lbk_ > 0) << 5)))); + ::fprintf(file, "\n %3hhu,%3hhu,", static_cast(len_), (static_cast(min_ | (one_ << 4) | ((lbk_ > 0) << 5) | (bol_ << 6)))); // save match characters chr_[0..len_-1] for (size_t i = 0; i < len_; ++i) ::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n ", static_cast(chr_[i])); diff --git a/lib/unicode.cpp b/lib/unicode.cpp index a63badf1..4dd782ee 100644 --- a/lib/unicode.cpp +++ b/lib/unicode.cpp @@ -83,9 +83,15 @@ Tables::Tables() range["Control"] = range["Cc"]; range["Format"] = range["Cf"]; - range["d"] = range["Decimal_Digit_Number"]; - range["l"] = range["Lowercase_Letter"]; - range["u"] = range["Uppercase_Letter"]; + range["Cntrl"] = range["C"]; + range["Digit"] = range["Nd"]; + range["Lower"] = range["Ll"]; + range["Punct"] = range["P"]; + range["Upper"] = range["Lu"]; + + range["d"] = range["Digit"]; + range["l"] = range["Lower"]; + range["u"] = range["Upper"]; range["s"] = range["Space"]; range["w"] = range["Word"]; } diff --git a/makemake.sh b/makemake.sh index b9829a47..dd2e496a 100755 --- a/makemake.sh +++ b/makemake.sh @@ -40,6 +40,7 @@ sed "s/define UGREP_VERSION \"[^\"]*\"/define UGREP_VERSION \"$1\"/" src/ugrep-i ./build.sh || exit 1 ./man.sh $1 +pushd completions/bash ; ./compgen.sh > /dev/null ; popd || exit 1 pushd completions/fish ; ./compgen.sh > /dev/null ; popd || exit 1 pushd completions/zsh ; ./compgen.sh > /dev/null ; popd || exit 1 diff --git a/man.sh b/man.sh index 48e1b002..3971f3db 100755 --- a/man.sh +++ b/man.sh @@ -75,9 +75,9 @@ forces empty matches for compatibility with other grep tools. .PP Option \fB-f\fR \fIFILE\fR matches patterns specified in \fIFILE\fR. .PP -By default Unicode patterns are matched. Option \fB-U\fR (\fB--binary\fR) -disables Unicode matching for ASCII and binary pattern matching. 
Non-Unicode -matching is generally more efficient. +By default Unicode patterns are matched. Option \fB-U\fR (\fB--ascii\fR or +\fB--binary\fR) disables Unicode matching for ASCII and binary pattern +matching. Non-Unicode matching is more efficient. .PP \fBugrep\fR accepts input of various encoding formats and normalizes the output to UTF-8. When a UTF byte order mark is present in the input, the input is @@ -555,10 +555,6 @@ List all Unicode words in a file: .IP $ ugrep -o '\\w+' myfile.txt .PP -List all ASCII words in a file: -.IP -$ ugrep -o '[[:word:]]+' myfile.txt -.PP List the laughing face emojis (Unicode code points U+1F600 to U+1F60F): .IP $ ugrep -o '[\\x{1F600}-\\x{1F60F}]' myfile.txt diff --git a/man/ug.1 b/man/ug.1 index 6530b432..b2dbab51 100644 --- a/man/ug.1 +++ b/man/ug.1 @@ -1,4 +1,4 @@ -.TH UGREP "1" "May 06, 2024" "ugrep 6.0.0" "User Commands" +.TH UGREP "1" "June 03, 2024" "ugrep 6.1.0" "User Commands" .SH NAME \fBugrep\fR, \fBug\fR -- file pattern searcher .SH SYNOPSIS @@ -60,9 +60,9 @@ forces empty matches for compatibility with other grep tools. .PP Option \fB-f\fR \fIFILE\fR matches patterns specified in \fIFILE\fR. .PP -By default Unicode patterns are matched. Option \fB-U\fR (\fB--binary\fR) -disables Unicode matching for ASCII and binary pattern matching. Non-Unicode -matching is generally more efficient. +By default Unicode patterns are matched. Option \fB-U\fR (\fB--ascii\fR or +\fB--binary\fR) disables Unicode matching for ASCII and binary pattern +matching. Non-Unicode matching is more efficient. .PP \fBugrep\fR accepts input of various encoding formats and normalizes the output to UTF-8. When a UTF byte order mark is present in the input, the input is @@ -261,8 +261,8 @@ unless \fB\-\-config\fR=\fIFILE\fR or \fB\-\-no\-config\fR is specified. \fB\-\-no\-config\fR Do not automatically load the default .ugrep configuration file. .TP -\fB\-\-confirm\fR -Confirm actions in \fB\-Q\fR query TUI. The default is confirm. 
+\fB\-\-no\-confirm\fR +Do not confirm actions in \fB\-Q\fR query TUI. The default is confirm. .TP \fB\-\-cpp\fR Output file matches in C++. See also options \fB\-\-format\fR and \fB\-u\fR. @@ -714,7 +714,7 @@ Shift\-Tab to navigate directories and to select a file to search. Press Enter to select lines to output. Press ALT\-l for option \fB\-l\fR to list files, ALT\-n for \fB\-n\fR, etc. Non\-option commands include ALT\-] to increase context and ALT\-} to increase fuzzyness. See -also options \fB\-\-confirm\fR, \fB\-\-delay\fR, \fB\-\-split\fR and \fB\-\-view\fR. +also options \fB\-\-no\-confirm\fR, \fB\-\-delay\fR, \fB\-\-split\fR and \fB\-\-view\fR. .TP \fB\-q\fR, \fB\-\-quiet\fR, \fB\-\-silent\fR Quiet mode: suppress all output. Only search a file until a match @@ -855,12 +855,8 @@ combine option \fB\-\-hexdump\fR with option \fB\-W\fR. See also option \fB\-U\ \fB\-w\fR, \fB\-\-word\-regexp\fR The PATTERN is searched for as a word, such that the matching text is preceded by a non\-word character and is followed by a non\-word -character. Word characters are letters, digits and the -underscore. With option \fB\-P\fR, word characters are Unicode letters, -digits and underscore. This option has no effect if \fB\-x\fR is also -specified. If a PATTERN is specified, or \fB\-e\fR \fIPATTERN\fR or \fB\-N\fR \fIPATTERN\fR, -then this option has no effect on \fB\-f\fR \fIFILE\fR patterns to allow \fB\-f\fR \fIFILE\fR -patterns to narrow or widen the scope of the PATTERN search. +character. Word\-like characters are Unicode letters, digits and +connector punctuations such as underscore. .TP \fB\-\-width\fR[=\fINUM\fR] Truncate the output to NUM visible characters per line. The width @@ -874,10 +870,7 @@ line from the hex output use option \fB\-\-hexdump\fR. See also option \fB\-U\f .TP \fB\-x\fR, \fB\-\-line\-regexp\fR Select only those matches that exactly match the whole line, as if -the patterns are surrounded by ^ and $. 
If a PATTERN is specified, -or \fB\-e\fR \fIPATTERN\fR or \fB\-N\fR \fIPATTERN\fR, then this option has no effect on -\fB\-f\fR \fIFILE\fR patterns to allow \fB\-f\fR \fIFILE\fR patterns to narrow or widen the -scope of the PATTERN search. +the patterns are surrounded by ^ and $. .TP \fB\-\-xml\fR Output file matches in XML. If \fB\-H\fR, \fB\-n\fR, \fB\-k\fR, or \fB\-b\fR is specified, @@ -1329,10 +1322,6 @@ List all Unicode words in a file: .IP $ ugrep -o '\\w+' myfile.txt .PP -List all ASCII words in a file: -.IP -$ ugrep -o '[[:word:]]+' myfile.txt -.PP List the laughing face emojis (Unicode code points U+1F600 to U+1F60F): .IP $ ugrep -o '[\\x{1F600}-\\x{1F60F}]' myfile.txt diff --git a/man/ugrep-indexer.1 b/man/ugrep-indexer.1 index da75dd88..9396e985 100644 --- a/man/ugrep-indexer.1 +++ b/man/ugrep-indexer.1 @@ -1,4 +1,4 @@ -.TH UGREP-INDEXER "1" "May 06, 2024" "ugrep-indexer 6.0.0" "User Commands" +.TH UGREP-INDEXER "1" "June 03, 2024" "ugrep-indexer 6.1.0" "User Commands" .SH NAME \fBugrep-indexer\fR -- file indexer to accelerate recursive searching .SH SYNOPSIS diff --git a/man/ugrep.1 b/man/ugrep.1 index 6530b432..b2dbab51 100644 --- a/man/ugrep.1 +++ b/man/ugrep.1 @@ -1,4 +1,4 @@ -.TH UGREP "1" "May 06, 2024" "ugrep 6.0.0" "User Commands" +.TH UGREP "1" "June 03, 2024" "ugrep 6.1.0" "User Commands" .SH NAME \fBugrep\fR, \fBug\fR -- file pattern searcher .SH SYNOPSIS @@ -60,9 +60,9 @@ forces empty matches for compatibility with other grep tools. .PP Option \fB-f\fR \fIFILE\fR matches patterns specified in \fIFILE\fR. .PP -By default Unicode patterns are matched. Option \fB-U\fR (\fB--binary\fR) -disables Unicode matching for ASCII and binary pattern matching. Non-Unicode -matching is generally more efficient. +By default Unicode patterns are matched. Option \fB-U\fR (\fB--ascii\fR or +\fB--binary\fR) disables Unicode matching for ASCII and binary pattern +matching. Non-Unicode matching is more efficient. 
.PP \fBugrep\fR accepts input of various encoding formats and normalizes the output to UTF-8. When a UTF byte order mark is present in the input, the input is @@ -261,8 +261,8 @@ unless \fB\-\-config\fR=\fIFILE\fR or \fB\-\-no\-config\fR is specified. \fB\-\-no\-config\fR Do not automatically load the default .ugrep configuration file. .TP -\fB\-\-confirm\fR -Confirm actions in \fB\-Q\fR query TUI. The default is confirm. +\fB\-\-no\-confirm\fR +Do not confirm actions in \fB\-Q\fR query TUI. The default is confirm. .TP \fB\-\-cpp\fR Output file matches in C++. See also options \fB\-\-format\fR and \fB\-u\fR. @@ -714,7 +714,7 @@ Shift\-Tab to navigate directories and to select a file to search. Press Enter to select lines to output. Press ALT\-l for option \fB\-l\fR to list files, ALT\-n for \fB\-n\fR, etc. Non\-option commands include ALT\-] to increase context and ALT\-} to increase fuzzyness. See -also options \fB\-\-confirm\fR, \fB\-\-delay\fR, \fB\-\-split\fR and \fB\-\-view\fR. +also options \fB\-\-no\-confirm\fR, \fB\-\-delay\fR, \fB\-\-split\fR and \fB\-\-view\fR. .TP \fB\-q\fR, \fB\-\-quiet\fR, \fB\-\-silent\fR Quiet mode: suppress all output. Only search a file until a match @@ -855,12 +855,8 @@ combine option \fB\-\-hexdump\fR with option \fB\-W\fR. See also option \fB\-U\ \fB\-w\fR, \fB\-\-word\-regexp\fR The PATTERN is searched for as a word, such that the matching text is preceded by a non\-word character and is followed by a non\-word -character. Word characters are letters, digits and the -underscore. With option \fB\-P\fR, word characters are Unicode letters, -digits and underscore. This option has no effect if \fB\-x\fR is also -specified. If a PATTERN is specified, or \fB\-e\fR \fIPATTERN\fR or \fB\-N\fR \fIPATTERN\fR, -then this option has no effect on \fB\-f\fR \fIFILE\fR patterns to allow \fB\-f\fR \fIFILE\fR -patterns to narrow or widen the scope of the PATTERN search. +character. 
Word\-like characters are Unicode letters, digits and +connector punctuations such as underscore. .TP \fB\-\-width\fR[=\fINUM\fR] Truncate the output to NUM visible characters per line. The width @@ -874,10 +870,7 @@ line from the hex output use option \fB\-\-hexdump\fR. See also option \fB\-U\f .TP \fB\-x\fR, \fB\-\-line\-regexp\fR Select only those matches that exactly match the whole line, as if -the patterns are surrounded by ^ and $. If a PATTERN is specified, -or \fB\-e\fR \fIPATTERN\fR or \fB\-N\fR \fIPATTERN\fR, then this option has no effect on -\fB\-f\fR \fIFILE\fR patterns to allow \fB\-f\fR \fIFILE\fR patterns to narrow or widen the -scope of the PATTERN search. +the patterns are surrounded by ^ and $. .TP \fB\-\-xml\fR Output file matches in XML. If \fB\-H\fR, \fB\-n\fR, \fB\-k\fR, or \fB\-b\fR is specified, @@ -1329,10 +1322,6 @@ List all Unicode words in a file: .IP $ ugrep -o '\\w+' myfile.txt .PP -List all ASCII words in a file: -.IP -$ ugrep -o '[[:word:]]+' myfile.txt -.PP List the laughing face emojis (Unicode code points U+1F600 to U+1F60F): .IP $ ugrep -o '[\\x{1F600}-\\x{1F60F}]' myfile.txt diff --git a/src/cnf.cpp b/src/cnf.cpp index c728d8f5..885df606 100644 --- a/src/cnf.cpp +++ b/src/cnf.cpp @@ -665,8 +665,12 @@ void CNF::report(FILE *output) const fprintf(output, " does not match"); else fprintf(output, " matches"); - if (flag_files) + if (flag_line_regexp) fprintf(output, " a line"); + else if (flag_word_regexp) + fprintf(output, " a word"); + if (flag_files) + fprintf(output, " on a line"); if (!terms.empty()) { @@ -709,8 +713,12 @@ void CNF::report(FILE *output) const fprintf(output, " does not match"); else fprintf(output, " matches"); - if (flag_files) + if (flag_line_regexp) fprintf(output, " a line"); + else if (flag_word_regexp) + fprintf(output, " a word"); + if (flag_files) + fprintf(output, " on a line"); } or_sep = true; diff --git a/src/cnf.hpp b/src/cnf.hpp index 734b9351..1605f283 100644 --- a/src/cnf.hpp +++ b/src/cnf.hpp 
@@ -273,11 +273,12 @@ class CNF { const char *xleft = flag_basic_regexp ? "^\\(" : "^(?:"; const char *xright = flag_basic_regexp ? "\\)$" : ")$"; #if defined(HAVE_PCRE2) - const char *wleft = flag_basic_regexp ? "\\<\\(" : flag_perl_regexp ? "(?" : flag_perl_regexp ? ")(?!\\w)" : ")\\>"; + // PCRE2_EXTRA_MATCH_WORD does not work and \b(?:regex)\b is not correct anyway, so we roll out our own + const char *wleft = flag_perl_regexp ? "(?" : flag_perl_regexp ? ")(?![[:word:]])" : ")\\>"; + const char *wleft = flag_perl_regexp ? "(?) + pattern.insert(0, wleft).append(wright); } } diff --git a/src/ugrep-indexer.cpp b/src/ugrep-indexer.cpp index 1cb72c07..e23ba312 100644 --- a/src/ugrep-indexer.cpp +++ b/src/ugrep-indexer.cpp @@ -28,19 +28,19 @@ /** @file ugrep-indexer.cpp -@brief file indexer for the ugrep search utility +@brief file system indexer for the ugrep search utility @author Robert van Engelen - engelen@genivia.com @copyright (c) 2023, Robert van Engelen, Genivia Inc. All rights reserved. @copyright (c) BSD-3 License - see LICENSE.txt */ // DO NOT ALTER THIS LINE: updated by makemake.sh and we need it physically here for MSVC++ build from source -#define UGREP_VERSION "6.0.0" +#define UGREP_VERSION "6.1.0" // use a task-parallel thread to decompress the stream into a pipe to search, also handles nested archives #define WITH_DECOMPRESSION_THREAD -// ignore hidden files and directories in archives, but ugrep will never find them! +// ignore hidden files and directories in archives, but ugrep will never find them anymore when searching hidden! 
// #define WITH_SKIP_HIDDEN_ARCHIVES // check if we are compiling for a windows OS, but not Cygwin or MinGW @@ -56,7 +56,7 @@ #ifdef OS_WIN // compiling for a windows OS -// disable min/max macros to use std::min and std::max +// disable legacy min/max macros so we can use std::min and std::max #define NOMINMAX #include @@ -72,6 +72,7 @@ #include #include +// 64 bits off_t and fseeko #define off_t int64_t #define fseeko _fseeki64 #define ftruncate _chsize_s @@ -105,7 +106,7 @@ std::wstring utf8_decode(const std::string &str) return wstr; } -// open Unicode wide string UTF-8 encoded filename +// open UTF-8 encoded Unicode filename int fopenw_s(FILE **file, const char *filename, const char *mode) { *file = NULL; @@ -173,7 +174,7 @@ inline uint64_t file_size(const WIN32_FIND_DATAW& ffd) #define PATHSEPCHR '/' #define PATHSEPSTR "/" -// open Unicode wide string UTF-8 encoded filename +// open UTF-8 encoded Unicode filename int fopenw_s(FILE **file, const char *filename, const char *mode) { *file = NULL; @@ -271,8 +272,9 @@ inline unsigned noise_percentage(int accuracy) typedef std::vector StrVec; // fixed constant strings -const char ugrep_index_filename[] = "._UG#_Store"; -const char ugrep_index_file_magic[5] = "UG#\x03"; +static const char ugrep_index_filename[] = "._UG#_Store"; +static const char ugrep_index_file_magic[5] = "UG#\x03"; +static const char ugrep_indexer_config_filename[] = ".ugrep-indexer"; // command-line optional PATH argument const char *arg_pathname = NULL; @@ -710,7 +712,7 @@ void usage(const char *message, const char *arg = NULL) else { ++warnings; - std::cerr << "ugrep-indexer: .ugrep-indexer configuration file: " << message << (arg != NULL ? arg : "") << '\n'; + std::cerr << "ugrep-indexer: " << ugrep_indexer_config_filename << " configuration file: " << message << (arg != NULL ? 
arg : "") << '\n'; } } @@ -1186,31 +1188,35 @@ void cat(const std::string& pathname, std::stack& dir_entries, std::vecto #endif - // check for ignore files, read them and push globs on the ignore_stack - if (!flag_ignore_files.empty() && !dir_only) + if (!dir_only) { - std::string ignore_filename; - - for (const auto& ignore : flag_ignore_files) + // check for ignore files, read them and push globs on the ignore_stack + if (!flag_ignore_files.empty()) { - ignore_filename.assign(pathname).append(PATHSEPSTR).append(ignore); - - FILE *file = NULL; + std::string filepath; - if (fopenw_s(&file, ignore_filename.c_str(), "r") == 0) + for (const auto& ignore : flag_ignore_files) { - // push globs imported from the ignore file to the back of the vectors - ignore_stack.emplace(); + filepath.assign(pathname).append(PATHSEPSTR).append(ignore); - // mark dir_entries stack with an empty pathname as a sentinel to pop the ignore_stack afterwards - dir_entries.emplace(""); - import_globs(file, ignore_stack.top().files, ignore_stack.top().dirs); - fclose(file); + FILE *file = NULL; + + if (fopenw_s(&file, filepath.c_str(), "r") == 0) + { + // push globs imported from the ignore file to the back of the vectors + ignore_stack.emplace(); + + // mark dir_entries stack with an empty pathname as a sentinel to pop the ignore_stack afterwards + dir_entries.emplace(""); + import_globs(file, ignore_stack.top().files, ignore_stack.top().dirs); + fclose(file); + } } } } ++num_dirs; + std::string entry_pathname; #ifdef OS_WIN @@ -1973,11 +1979,11 @@ void options(int argc, const char **argv) } // load .ugrep-indexer config file when present in the working or home directory -void load_config() +void load_config(const char *config_filename) { // open a local config file or in the home directory FILE *file = NULL; - if (fopenw_s(&file, ".ugrep-indexer", "r") != 0) + if (fopenw_s(&file, config_filename, "r") != 0) { #ifdef OS_WIN const char *home_dir = getenv("USERPROFILE"); @@ -1988,9 +1994,9 @@ 
void load_config() if (home_dir != NULL) { // open a config file in the home directory - std::string config_file; - config_file.assign(home_dir).append(PATHSEPSTR).append(".ugrep-indexer"); - if (fopenw_s(&file, config_file.c_str(), "r") != 0) + std::string config_filepath; + config_filepath.assign(home_dir).append(PATHSEPSTR).append(config_filename); + if (fopenw_s(&file, config_filepath.c_str(), "r") != 0) file = NULL; } } @@ -2024,15 +2030,18 @@ void load_config() } } + // bail out when config file has errors if (warnings > 0) exit(EXIT_FAILURE); + // reset flag flag_usage_warnings = false; fclose(file); } } +// where the magic happens int main(int argc, const char **argv) { #if !defined(OS_WIN) && defined(HAVE_LIBZ) && defined(WITH_DECOMPRESSION_THREAD) @@ -2040,7 +2049,7 @@ int main(int argc, const char **argv) signal(SIGPIPE, SIG_IGN); #endif - load_config(); + load_config(ugrep_indexer_config_filename); options(argc, argv); diff --git a/src/ugrep.cpp b/src/ugrep.cpp index 4bbf4c28..c17b9f1e 100644 --- a/src/ugrep.cpp +++ b/src/ugrep.cpp @@ -7386,12 +7386,25 @@ void terminal() } } - // --tree: check UTF-8 terminal support + // --tree: check UTF-8 terminal support, this is a guess on LANG or LC_CTYPE or LC_ALL if (flag_tree && (flag_query || flag_files_with_matches || flag_files_without_match || flag_count)) { const char *lang = getenv("LANG"); - if (lang != NULL && strstr(lang, "UTF-8")) + if (lang == NULL || strstr(lang, "UTF-8") == NULL) + { + lang = getenv("LC_CTYPE"); + + if (lang == NULL || strstr(lang, "UTF-8") == NULL) + { + lang = getenv("LC_ALL"); + + if (lang == NULL || strstr(lang, "UTF-8") == NULL) + lang = NULL; + } + } + + if (lang != NULL) { Output::Tree::bar = "│ "; Output::Tree::ptr = "╰╴"; @@ -8001,9 +8014,6 @@ void ugrep() // -f: get patterns from file if (!flag_file.empty()) { - bool line_regexp = flag_line_regexp; - bool word_regexp = flag_word_regexp; - // -F: make newline-separated lines in regex literal with \Q and \E const char *Q = 
flag_fixed_strings ? "\\Q" : ""; const char *E = flag_fixed_strings ? "\\E|" : flag_basic_regexp ? "\\|" : "|"; @@ -8014,11 +8024,6 @@ void ugrep() // -F does not apply to patterns in -f FILE when PATTERN or -e PATTERN is specified Q = ""; E = flag_basic_regexp ? "\\|" : "|"; - - // -x and -w do not apply to patterns in -f FILE when PATTERN or -e PATTERN is specified - line_regexp = false; - word_regexp = false; - regex.append(E); } @@ -8105,10 +8110,11 @@ void ugrep() regex.pop_back(); } - // -x or -w: if no PATTERN is specified, then apply -x or -w to all -f FILE patterns + // -x or -w: apply to all -f FILE patterns if (regex.empty()) { - if (line_regexp) + // -x: empty regex matches empty lines with ^$ + if (flag_line_regexp) regex.assign("^$"); } else @@ -8117,17 +8123,18 @@ void ugrep() const char *xleft = flag_basic_regexp ? "^\\(" : "^(?:"; const char *xright = flag_basic_regexp ? "\\)$" : ")$"; #if defined(HAVE_PCRE2) - const char *wleft = flag_basic_regexp ? "\\<\\(" : flag_perl_regexp ? "(?" : flag_perl_regexp ? ")(?!\\w)" : ")\\>"; + // PCRE2_EXTRA_MATCH_WORD does not work and \b(?:regex)\b is not correct anyway, so we roll out our own + const char *wleft = flag_perl_regexp ? "(?" : flag_perl_regexp ? ")(?![[:word:]])" : ")\\>"; + const char *wleft = flag_perl_regexp ? "(? match only left side and right side of a word boundary, respectively + // -w: match whole words if (flag_word_regexp) matcher_options.push_back('W'); @@ -8412,7 +8419,7 @@ void ugrep() if (flag_invert_match) matcher_options.clear(); else // -x and --match: only match empty lines (set LineMatcher option W) - matcher_options.push_back('W'); + matcher_options.push_back('X'); // do not invert when searching flag_invert_match = false; @@ -8450,8 +8457,9 @@ void ugrep() // -P: Perl matching with PCRE2 or Boost.Regex #if defined(HAVE_PCRE2) // construct the PCRE2 JIT-optimized NFA-based Perl pattern matcher + uint32_t options = flag_binary ? 
(PCRE2_NEVER_UTF | PCRE2_NEVER_UCP) : (PCRE2_UTF | PCRE2_UCP); Static::string_pattern.assign(flag_binary ? reflex::PCRE2Matcher::convert(regex, convert_flags, &flag_multiline) : reflex::PCRE2UTFMatcher::convert(regex, convert_flags, &flag_multiline)); - Static::matcher = std::unique_ptr(new reflex::PCRE2Matcher(Static::string_pattern, reflex::Input(), matcher_options.c_str(), flag_binary ? (PCRE2_NEVER_UTF | PCRE2_NEVER_UCP) : (PCRE2_UTF | PCRE2_UCP))); + Static::matcher = std::unique_ptr(new reflex::PCRE2Matcher(Static::string_pattern, reflex::Input(), matcher_options.c_str(), options)); Static::matchers.clear(); if (!Static::bcnf.singleton_or_undefined()) @@ -8469,7 +8477,7 @@ void ugrep() if (j) { subregex.assign(pattern_options).append(*j); - submatchers.emplace_back(new reflex::PCRE2Matcher((flag_binary ? reflex::PCRE2Matcher::convert(subregex, convert_flags) : reflex::PCRE2UTFMatcher::convert(subregex, convert_flags)), reflex::Input(), matcher_options.c_str(), flag_binary ? (PCRE2_NEVER_UTF | PCRE2_NEVER_UCP) : (PCRE2_UTF | PCRE2_UCP))); + submatchers.emplace_back(new reflex::PCRE2Matcher((flag_binary ? reflex::PCRE2Matcher::convert(subregex, convert_flags) : reflex::PCRE2UTFMatcher::convert(subregex, convert_flags)), reflex::Input(), matcher_options.c_str(), options)); } else { @@ -13855,8 +13863,8 @@ void help(std::ostream& out) unless --config=FILE or --no-config is specified.\n\ --no-config\n\ Do not automatically load the default .ugrep configuration file.\n\ - --confirm\n\ - Confirm actions in -Q query TUI. The default is confirm.\n\ + --no-confirm\n\ + Do not confirm actions in -Q query TUI. The default is confirm.\n\ --cpp\n\ Output file matches in C++. See also options --format and -u.\n\ --csv\n\ @@ -14277,7 +14285,7 @@ void help(std::ostream& out) Press Enter to select lines to output. Press ALT-l for option -l\n\ to list files, ALT-n for -n, etc. Non-option commands include\n\ ALT-] to increase context and ALT-} to increase fuzzyness. 
See\n\ - also options --confirm, --delay, --split and --view.\n\ + also options --no-confirm, --delay, --split and --view.\n\ -q, --quiet, --silent\n\ Quiet mode: suppress all output. Only search a file until a match\n\ has been found.\n\ @@ -14383,12 +14391,8 @@ void help(std::ostream& out) -w, --word-regexp\n\ The PATTERN is searched for as a word, such that the matching text\n\ is preceded by a non-word character and is followed by a non-word\n\ - character. Word characters are letters, digits and the\n\ - underscore. With option -P, word characters are Unicode letters,\n\ - digits and underscore. This option has no effect if -x is also\n\ - specified. If a PATTERN is specified, or -e PATTERN or -N PATTERN,\n\ - then this option has no effect on -f FILE patterns to allow -f FILE\n\ - patterns to narrow or widen the scope of the PATTERN search.\n\ + character. Word-like characters are Unicode letters, digits and\n\ + connector punctuations such as underscore.\n\ --width[=NUM]\n\ Truncate the output to NUM visible characters per line. The width\n\ of the terminal window is used if NUM is not specified. Note that\n\ @@ -14399,10 +14403,7 @@ void help(std::ostream& out) line from the hex output use option --hexdump. See also option -U.\n\ -x, --line-regexp\n\ Select only those matches that exactly match the whole line, as if\n\ - the patterns are surrounded by ^ and $. If a PATTERN is specified,\n\ - or -e PATTERN or -N PATTERN, then this option has no effect on\n\ - -f FILE patterns to allow -f FILE patterns to narrow or widen the\n\ - scope of the PATTERN search.\n\ + the patterns are surrounded by ^ and $.\n\ --xml\n\ Output file matches in XML. If -H, -n, -k, or -b is specified,\n\ additional values are output. 
See also options --format and -u.\n\ @@ -14509,12 +14510,18 @@ void help(std::ostream& out) // print a helpful information for WHAT, if specified, and exit void help(const char *what) { + // strip = from =WHAT if (what != NULL && *what == '=') ++what; + // strip --no from --no-WHAT if (what != NULL && strncmp(what, "--no", 4) == 0) what += 4; + // strip one dash from --WHAT + if (what != NULL && strncmp(what, "--", 2) == 0) + ++what; + if (what == NULL || *what == '\0') { help(std::cout); diff --git a/src/ugrep.hpp b/src/ugrep.hpp index 20845e15..5e16a0aa 100644 --- a/src/ugrep.hpp +++ b/src/ugrep.hpp @@ -38,7 +38,7 @@ #define UGREP_HPP // DO NOT ALTER THIS LINE: updated by makemake.sh and we need it physically here for MSVC++ build from source -#define UGREP_VERSION "6.0.0" +#define UGREP_VERSION "6.1.0" // disable mmap because mmap is almost always slower than the file reading speed improvements since 3.0.0 #define WITH_NO_MMAP @@ -67,7 +67,7 @@ #ifdef OS_WIN // compiling for a windows OS -// disable min/max macros to use std::min and std::max +// disable legacy min/max macros so we can use std::min and std::max #define NOMINMAX #include @@ -163,7 +163,7 @@ inline char *getcwd0() return strdup(cwd.c_str()); } -// open Unicode wide string UTF-8 encoded filename +// open UTF-8 encoded Unicode filename inline int fopenw_s(FILE **file, const char *filename, const char *mode) { *file = NULL; @@ -258,7 +258,7 @@ inline int dupenv_s(char **ptr, const char *name) return 0; } -// Open Unicode wide string UTF-8 encoded filename +// Open UTF-8 encoded Unicode filename inline int fopenw_s(FILE **file, const char *filename, const char *mode) { *file = NULL; diff --git a/src/zstream.hpp b/src/zstream.hpp index a0dd2629..a226faf9 100644 --- a/src/zstream.hpp +++ b/src/zstream.hpp @@ -1580,7 +1580,9 @@ class zstreambuf : public std::streambuf { if (num > len) num = len; - memcpy(buf, buf_ + cur_, num); + // move decompressed data to destination buf, which can be the same as the 
source buffer after get_buffer() + if (buf != buf_ + cur_) + memmove(buf, buf_ + cur_, num); cur_ += num; diff --git a/tests/out/lorem.latin1-F-iwco.out b/tests/out/lorem.latin1-F-iwco.out index d6b24041..3c032078 100644 --- a/tests/out/lorem.latin1-F-iwco.out +++ b/tests/out/lorem.latin1-F-iwco.out @@ -1 +1 @@ -19 +18 diff --git a/tests/out/lorem.latin1-G-iwco.out b/tests/out/lorem.latin1-G-iwco.out index d6b24041..3c032078 100644 --- a/tests/out/lorem.latin1-G-iwco.out +++ b/tests/out/lorem.latin1-G-iwco.out @@ -1 +1 @@ -19 +18 diff --git a/tests/out/lorem.latin1-iwco.out b/tests/out/lorem.latin1-iwco.out index d6b24041..3c032078 100644 --- a/tests/out/lorem.latin1-iwco.out +++ b/tests/out/lorem.latin1-iwco.out @@ -1 +1 @@ -19 +18 diff --git a/tests/out/lorem.utf8-F-iwco.out b/tests/out/lorem.utf8-F-iwco.out index d6b24041..3c032078 100644 --- a/tests/out/lorem.utf8-F-iwco.out +++ b/tests/out/lorem.utf8-F-iwco.out @@ -1 +1 @@ -19 +18 diff --git a/tests/out/lorem.utf8-G-iwco.out b/tests/out/lorem.utf8-G-iwco.out index d6b24041..3c032078 100644 --- a/tests/out/lorem.utf8-G-iwco.out +++ b/tests/out/lorem.utf8-G-iwco.out @@ -1 +1 @@ -19 +18 diff --git a/tests/out/lorem.utf8-iwco.out b/tests/out/lorem.utf8-iwco.out index d6b24041..3c032078 100644 --- a/tests/out/lorem.utf8-iwco.out +++ b/tests/out/lorem.utf8-iwco.out @@ -1 +1 @@ -19 +18 diff --git a/vs/ugrep/ugrep/ugrep.vcxproj b/vs/ugrep/ugrep/ugrep.vcxproj index e756af9b..0decaa62 100755 --- a/vs/ugrep/ugrep/ugrep.vcxproj +++ b/vs/ugrep/ugrep/ugrep.vcxproj @@ -37,6 +37,7 @@ + @@ -54,7 +55,11 @@ - + + + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + @@ -218,7 +223,7 @@ WIN32;NDEBUG;_CONSOLE;WITH_NO_INDENT;WITH_NO_CODEGEN;HAVE_AVX2;HAVE_PCRE2;PCRE2_STATIC;HAVE_LIBZ;HAVE_LIBBZ2;HAVE_LIBLZMA;HAVE_LIBLZ4;HAVE_LIBZSTD;WITH_COLOR;WITH_NO_HIDDEN;ZLIB_WINAPI;NO_GZCOMPRESS;LZMA_API_STATIC;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) true MultiThreaded - 
$(ProjectDir)\include;$(ProjectDir)\..\pcre2-10.37\src;$(ProjectDir)\..\zlib-1.2.11;$(ProjectDir)\..\bzip2-1.0.5;$(ProjectDir)\..\api;$(ProjectDir)\..\lz4-dev\lib;$(ProjectDir)\..\zstd-dev\lib;$(ProjectDir)\lzma\C;%(AdditionalIncludeDirectories) + $(ProjectDir)\include;$(ProjectDir)\..\pcre2-10.42\src;$(ProjectDir)\..\zlib-1.2.11;$(ProjectDir)\..\bzip2-1.0.5;$(ProjectDir)\..\api;$(ProjectDir)\..\lz4-dev\lib;$(ProjectDir)\..\zstd-dev\lib;$(ProjectDir)\lzma\C;%(AdditionalIncludeDirectories) Speed @@ -246,7 +251,7 @@ NDEBUG;_CONSOLE;WITH_NO_INDENT;WITH_NO_CODEGEN;HAVE_AVX2;HAVE_PCRE2;PCRE2_STATIC;HAVE_LIBZ;HAVE_LIBBZ2;HAVE_LIBLZMA;HAVE_LIBLZ4;HAVE_LIBZSTD;WITH_COLOR;WITH_NO_HIDDEN;ZLIB_WINAPI;NO_GZCOMPRESS;LZMA_API_STATIC;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_WARNINGS;%(PreprocessorDefinitions) true MultiThreaded - $(ProjectDir)\include;$(ProjectDir)\..\pcre2-10.37\src;$(ProjectDir)\..\zlib-1.2.11;$(ProjectDir)\..\bzip2-1.0.5;$(ProjectDir)\..\api;$(ProjectDir)\..\lz4-dev\lib;$(ProjectDir)\..\zstd-dev\lib;$(ProjectDir)\lzma\C + $(ProjectDir)\include;$(ProjectDir)\..\pcre2-10.42\src;$(ProjectDir)\..\zlib-1.2.11;$(ProjectDir)\..\bzip2-1.0.5;$(ProjectDir)\..\api;$(ProjectDir)\..\lz4-dev\lib;$(ProjectDir)\..\zstd-dev\lib;$(ProjectDir)\lzma\C Speed NotSet diff --git a/vs/ugrep/ugrep/ugrep.vcxproj.filters b/vs/ugrep/ugrep/ugrep.vcxproj.filters index 52ac0722..2848720f 100755 --- a/vs/ugrep/ugrep/ugrep.vcxproj.filters +++ b/vs/ugrep/ugrep/ugrep.vcxproj.filters @@ -141,6 +141,12 @@ Source Files + + Source Files + + + Source Files +