diff --git a/build.zig b/build.zig index 84052ba..6ff3207 100644 --- a/build.zig +++ b/build.zig @@ -6,7 +6,7 @@ pub fn build(b: *std.Build) void { const lib = b.addStaticLibrary(.{ .name = "Fluent", - .root_source_file = b.path("fluent.zig"), + .root_source_file = b.path("src/fluent.zig"), .target = target, .optimize = optimize, }); @@ -15,7 +15,7 @@ pub fn build(b: *std.Build) void { const check = b.addStaticLibrary(.{ .name = "Fluent", - .root_source_file = b.path("fluent.zig"), + .root_source_file = b.path("src/fluent.zig"), .target = target, .optimize = optimize, .use_llvm = false, @@ -25,7 +25,7 @@ pub fn build(b: *std.Build) void { check_step.dependOn(&check.step); const lib_unit_tests = b.addTest(.{ - .root_source_file = b.path("fluent.zig"), + .root_source_file = b.path("src/fluent.zig"), .target = target, .optimize = optimize, }); @@ -35,7 +35,7 @@ pub fn build(b: *std.Build) void { test_step.dependOn(&run_lib_unit_tests.step); _ = b.addModule("Fluent", .{ - .root_source_file = b.path("fluent.zig"), + .root_source_file = b.path("src/fluent.zig"), .target = target, .optimize = optimize, }); diff --git a/build.zig.zon b/build.zig.zon index 823ae5d..2f9034c 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -2,7 +2,7 @@ .name = "Fluent", .version = "2.1.0", .paths = .{ - "fluent.zig", + "src", "LICENSE", "README.md", "build.zig.zon", diff --git a/fluent.zig b/src/fluent.zig similarity index 55% rename from fluent.zig rename to src/fluent.zig index 5f33335..9dd7f9f 100644 --- a/fluent.zig +++ b/src/fluent.zig @@ -3,489 +3,103 @@ const Child = std.meta.Child; const Order = std.math.Order; const ReduceOp = std.builtin.ReduceOp; const math = std.math; -// name disambiguation -const Fluent = @This(); - -// This is the new reference for the Enum Type -// pub const Type = union(enum) { -// type: void, -// void: void, -// bool: void, -// noreturn: void, -// int: Int, -// float: Float, -// pointer: Pointer, -// array: Array, -// @"struct": Struct, -// comptime_float: 
void, -// comptime_int: void, -// undefined: void, -// null: void, -// optional: Optional, -// error_union: ErrorUnion, -// error_set: ErrorSet, -// @"enum": Enum, -// @"union": Union, -// @"fn": Fn, -// @"opaque": Opaque, -// frame: Frame, -// @"anyframe": AnyFrame, -// vector: Vector, -// enum_literal: void, //////////////////////////////////////////////////////////////////////////////// -// Public Fluent Interface Access Point /// +// HELPERS IMPORT /// //////////////////////////////////////////////////////////////////////////////// -pub fn init(slice: anytype) FluentInterface(@TypeOf(slice)) { - return .{ .items = slice }; -} - -fn FluentInterface(comptime T: type) type { - return struct { - const Self = @This(); - - pub const DataType = DeepChild(T); - - pub const SliceType = if (isConst(T)) []const DataType else []DataType; - - items: SliceType, - - pub usingnamespace if (DataType == u8) blk: { - break :blk if (isConst(T)) - ImmutableStringBackend(Self) - else - MutableStringBackend(Self); - } else blk: { - break :blk if (isConst(T)) - ImmutableNumericBackend(Self) - else - MutableNumericBackend(Self); - }; - - pub fn iterator( - self: Self, - comptime mode: IteratorMode, - ) BaseIterator(DataType, mode) { - return Fluent.iterator(mode, self.items); - } - }; -} +const flth = @import("fluent_helpers.zig"); +const add = flth.add; +const mul = flth.mul; +const min = flth.min; +const max = flth.max; +const default = flth.default; +const isConst = flth.isConst; +const isSlice = flth.isSlice; +const isFloat = flth.isFloat; +const identity = flth.identity; +const tupleSize = flth.tupleSize; +const wrapIndex = flth.wrapIndex; +const isInteger = flth.isInteger; +const DeepChild = flth.DeepChild; +const Parameter = flth.Parameter; +const simdReduce = flth.simdReduce; +const reduceInit = flth.reduceInit; +const isUnsigned = flth.isUnsigned; //////////////////////////////////////////////////////////////////////////////// -// Public Fluent Iterator Access Point //// +// 
REGEX IMPORT /// //////////////////////////////////////////////////////////////////////////////// -/// enum {forward, reverse} -pub const IteratorMode = enum { forward, reverse }; - -pub fn BaseIterator(comptime T: type, mode: IteratorMode) type { - return IteratorInterface(T, mode, void{}, identity); -} - -pub fn iterator( - comptime mode: IteratorMode, - items: anytype, -) BaseIterator(DeepChild(@TypeOf(items)), mode) { - const T = DeepChild(@TypeOf(items)); - - if (comptime !isSlice(@TypeOf(items))) { - return iterator(mode, @as([]const T, items)); - } - - const P = [*c]const T; - - const ptr: P = if (comptime mode == .forward) - @as(P, @ptrCast(items.ptr)) - else - (@as(P, @ptrCast(items.ptr)) + items.len) - 1; - - const end: P = if (comptime mode == .forward) - @as(P, @ptrCast(items.ptr)) + items.len - else - @as(P, @ptrCast(items.ptr)) - 1; - - return .{ - .ptr = ptr, - .end = end, - .stride = 1, - }; -} - -pub fn MatchIterator( - comptime expression: []const u8, -) type { - return struct { - const Self = @This(); - const tree = ParseRegexTree(expression); - items: []const u8, - index: usize, - - pub fn init(items: []const u8) Self { - return .{ .items = items, .index = 0 }; - } - - pub fn next(self: *Self) ?FluentInterface([]const u8) { - while (self.index < self.items.len) : (self.index += 1) { - if (tree.call(self.items, self.index, false)) |last| { - - // non-advancing calls - if (self.index == last) - continue; - - defer self.index = last; - return Fluent.init(self.items[self.index..last]); - } - } - return null; - } - - pub fn span(self: *Self) ?struct { pos: usize, end: usize } { - while (self.index < self.items.len) : (self.index += 1) { - if (tree.call(self.items, self.index, false)) |last| { - - // non-advancing calls - if (self.index == last) - continue; - - defer self.index = last; - return .{ .pos = self.index, .end = last }; - } - } - return null; - } - }; -} - -/// match - match substrings based on an expression -pub fn match( - comptime 
expression: []const u8, - source: []const u8, -) MatchIterator(expression) { - return MatchIterator(expression).init(source); -} - -fn SplitIterator(comptime expression: []const u8) type { - return struct { - const Self = @This(); - const tree = ParseRegexTree(expression); - items: []const u8, - index: ?usize, - - pub fn init(items: []const u8) Self { - return .{ .items = items, .index = 0 }; - } - - pub fn next(self: *Self) ?FluentInterface([]const u8) { - const start = self.index orelse return null; - var stop: usize = start; - const end: ?usize = blk: { - while (stop < self.items.len) : (stop += 1) { - const last = tree.call(self.items, stop, false) orelse continue; - - // non-advancing calls - if (start == last) - continue; - - break :blk last; - } else break :blk null; - }; - defer self.index = end; - return Fluent.init(self.items[start..stop]); - } - - pub fn span(self: *Self) struct { pos: usize, end: usize } { - const start = self.index orelse return null; - var stop: usize = start; - const end: ?usize = blk: { - while (stop < self.items.len) : (stop += 1) { - const last = tree.call(self.items, stop, false) orelse continue; - - // non-advancing calls - if (start == last) - continue; - - break :blk last; - } else break :blk null; - }; - defer self.index = end; - return .{ .pos = start, .end = stop }; - } - }; -} - -/// split - splits a string based on a delimiting expression -pub fn split( - comptime expression: []const u8, - source: []const u8, -) SplitIterator(expression) { - return SplitIterator(expression).init(source); -} +const fltregx = @import("fluent_regex.zig"); +const ParseRegexTree = fltregx.ParseRegexTree; //////////////////////////////////////////////////////////////////////////////// -// UNARY FUNCTION ADAPTER : // +// UNARY FN ADAPTER /// //////////////////////////////////////////////////////////////////////////////// -////////////////////////////////////////////////////////////////////// -// chain: combine multiple unary functions into a 
single in-order call - -pub fn Chain( - comptime unary_tuple: anytype, -) type { - return struct { - pub fn call(x: anytype) @TypeOf(@call(.auto, unwrap, .{ 0, unary_tuple, default(@TypeOf(x)) })) { - return @call(.always_inline, unwrap, .{ 0, unary_tuple, x }); - } - }; -} - -fn unwrap( - comptime pos: usize, - comptime unary_tuple: anytype, - arg: anytype, -) if (pos < tupleSize(unary_tuple)) - @TypeOf(unary_tuple[pos](default(@TypeOf(arg)))) -else - @TypeOf(arg) { - // this is a forward-unwrap that passes - // outcomes of one function to the next - if (comptime pos == tupleSize(unary_tuple)) { - return arg; - } - return @call(.always_inline, unwrap, .{ (pos + 1), unary_tuple, @call(.always_inline, unary_tuple[pos], .{arg}) }); -} - -////////////////////////////////////////////////////////////////////// -// bind: affix comptime arguments to the front of a function - -pub fn bind( - comptime bind_tuple: anytype, - comptime function: anytype, -) BindRetun(bind_tuple, function) { - const bind_count = comptime tupleSize(bind_tuple); - const total_count = comptime @typeInfo(@TypeOf(function)).@"fn".params.len; - - if (comptime total_count - bind_count == 1) { - return struct { - pub fn call(x: anytype) @TypeOf(x) { - return @call(.always_inline, function, bind_tuple ++ .{x}); - } - }.call; - } else { - return struct { - pub fn call(x: anytype, y: anytype) @TypeOf(x) { - return @call(.always_inline, function, bind_tuple ++ .{ x, y }); - } - }.call; - } -} - -fn BindRetun( - comptime bind_tuple: anytype, - comptime function: anytype, -) type { - const total_count = comptime @typeInfo(@TypeOf(function)).@"fn".params.len; - const bind_count = comptime tupleSize(bind_tuple); - - if (comptime total_count < bind_count) - @compileError("too many arguments to bind"); - - if (comptime total_count - bind_count > 2) - @compileError("fluent bind must result in unary or binary function"); - - const choices = struct { - pub fn unary(x: anytype) @TypeOf(x) { - return x; - } - pub fn 
binary(x: anytype, y: anytype) @TypeOf(x) { - _ = &y; - return x; - } - }; - return if (comptime (total_count - bind_count) == 1) - @TypeOf(choices.unary) - else - @TypeOf(choices.binary); -} +const fltfnadapt = @import("fluent_unary_fn_adapter.zig"); +const bind = fltfnadapt.bind; +const Chain = fltfnadapt.Chain; +const unwrap = fltfnadapt.unwrap; +const BindReturn = fltfnadapt.BindReturn; //////////////////////////////////////////////////////////////////////////////// -// Backends and Implementation // +// ITERATOR IMPORTS /// //////////////////////////////////////////////////////////////////////////////// +const fltiter = @import("fluent_iterator.zig"); +const split = fltiter.split; +const match = fltiter.match; +const iterator = fltiter.iterator; +const BaseIterator = fltiter.BaseIterator; +const MatchIterator = fltiter.MatchIterator; +const SplitIterator = fltiter.SplitIterator; +const IteratorInterface = fltiter.IteratorInterface; +const IteratorMode = fltiter.IteratorMode; //////////////////////////////////////////////////////////////////////////////// -// Iterator Interface Implementation: // +// ENUM IMPORTS /// //////////////////////////////////////////////////////////////////////////////// +const fltenum = @import("fluent_enum.zig"); +const StringMode = fltenum.StringMode; +const DirectionOption = fltenum.DirectionOption; +const ReplaceOption = fltenum.ReplaceOption; +const TrimOptions = fltenum.TrimOptions; +const SortDirection = fltenum.SortDirection; +const SampleOption = fltenum.SampleOption; +const FluentMode = fltenum.FluentMode; -fn IteratorInterface( - comptime DataType: type, - mode: IteratorMode, - comptime filters: anytype, // tuple or function - comptime transforms: anytype, // tuple or function -) type { - return struct { - const Self = @This(); - const Mode = mode; - - ptr: [*c]const DataType, - end: [*c]const DataType, - stride: usize, - - pub fn next(self: *Self) ?DataType { - if (comptime @TypeOf(filters) != void) { - // apply single 
filter or tuple of filters - switch (comptime @typeInfo(@TypeOf(filters))) { - .@"fn" => { - if (comptime Mode == .forward) { - while (self.ptr < self.end and !filters(self.ptr.*)) - self.ptr += self.stride; - } else { - while (self.ptr > self.end and !filters(self.ptr.*)) - self.ptr -= self.stride; - } - }, - else => outer: { // applies inline filters - if (comptime Mode == .forward) { - inner: while (self.ptr < self.end) : (self.ptr += self.stride) { - inline for (filters) |f| { - if (!f(self.ptr.*)) continue :inner; - } - break :outer; - } - } else { - inner: while (self.ptr > self.end) : (self.ptr -= self.stride) { - inline for (filters) |f| { - if (!f(self.ptr.*)) continue :inner; - } - break :outer; - } - } - }, - } - } +//////////////////////////////////////////////////////////////////////////////// +// STATIC BITSET IMPORTS /// +//////////////////////////////////////////////////////////////////////////////// +const fltstbset = @import("fluent_static_bitset.zig"); - // unpack transforms into single transform call - const transform = comptime if (@typeInfo(@TypeOf(transforms)) == .@"fn") - transforms - else - Fluent.Chain(transforms).call; +const StringBitSet = fltstbset.StringBitSet; - switch (comptime Mode) { - .forward => { - if (self.ptr < self.end) { - defer self.ptr += self.stride; - return @call(.always_inline, transform, .{self.ptr.*}); - } - }, - .reverse => { - if (self.ptr > self.end) { - defer self.ptr -= self.stride; - return @call(.always_inline, transform, .{self.ptr.*}); - } - }, - } - return null; - } +//////////////////////////////////////////////////////////////////////////////// +// Public Fluent Interface Access Point /// +//////////////////////////////////////////////////////////////////////////////// - /// strided - set iterator stride (default 1) - pub fn strided( - self: Self, - stride_size: usize, - ) Self { - return .{ - .ptr = self.ptr, - .end = self.end, - .stride = stride_size, - }; - } +pub const Fluent = @This(); - /// window - 
return a slice and advance by stride - pub fn window( - self: *Self, - window_size: usize, - ) ?FluentInterface([]const DataType) { - switch (comptime Mode) { - .forward => { - if (self.ptr + window_size <= self.end) { - defer _ = self.next(); - return Fluent.init(self.ptr[0..window_size]); - } - }, - .reverse => { - if ((self.ptr + 1) - window_size > self.end) { - defer _ = self.next(); - return Fluent.init(((self.ptr + 1) - window_size)[0..window_size]); - } - }, - } - return null; - } +pub fn init(slice: anytype) FluentInterface(@TypeOf(slice)) { + return .{ .items = slice }; +} - /// map - transforms every elment in the acquired slice with a given unary function - pub fn map( - self: Self, - comptime new_transforms: anytype, - ) IteratorInterface(DataType, Mode, filters, new_transforms) { - return .{ - .ptr = self.ptr, - .end = self.end, - .stride = self.stride, - }; - } +pub fn FluentInterface(comptime T: type) type { + return struct { + const Self = @This(); - /// filter - acquire a unary predicate or a tuple of unary predicates - pub fn filter( - self: Self, - comptime new_filters: anytype, - ) IteratorInterface(DataType, Mode, new_filters, transforms) { - return .{ - .ptr = self.ptr, - .end = self.end, - .stride = self.stride, - }; - } + pub const DataType = DeepChild(T); - pub fn write( - self: anytype, // for both const and non-const pointers - items: []DataType, - ) usize { - // enable chaining without temporaries - if (comptime isConst(@TypeOf(self))) { - var tmp = self.*; - return tmp.write(items); - } - var count: usize = 0; - while (count < items.len) : (count += 1) { - items[count] = self.next() orelse return count; - } - return count; - } + pub const SliceType = if (isConst(T)) []const DataType else []DataType; - pub fn reduce( - self: anytype, // for both const and non-const pointers - comptime T: type, - comptime binary_func: anytype, // single binary function - initial: T, - ) T { - // enable chaining without temporaries - if (comptime 
isConst(@TypeOf(self))) { - var tmp = self.*; - return tmp.reduce(T, binary_func, initial); - } - var rdx = initial; - while (self.next()) |x| { - rdx = @call(.always_inline, binary_func, .{ rdx, x }); - } - return rdx; - } - }; -} + items: SliceType, -//////////////////////////////////////////////////// -// GeneralBackend ////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + /// GeneralImmutableBackend /// + //////////////////////////////////////////////////////////////////////////////// -pub fn GeneralImmutableBackend(comptime Self: type) type { - return struct { - /// all - check if all elements of the acquired slice are true by given predicate pub fn all(self: Self, predicate: fn (Self.DataType) bool) bool { return for (self.items) |x| { if (!predicate(x)) break false; @@ -530,11 +144,6 @@ pub fn GeneralImmutableBackend(comptime Self: type) type { } } - // NOTE: - // using slices here because this makes it directly - // obvious that we're support any kind of slice and - // both Mutable and Immutable backends. - ///order - returns the lexicographical order compared to a given slice pub fn order(self: Self, items: []const Self.DataType) Order { return std.mem.order(Self.DataType, self.items, items); @@ -670,66 +279,12 @@ pub fn GeneralImmutableBackend(comptime Self: type) type { } return .{ .items = join_buffer[0..curr_idx] }; } - }; -} - -//////////////////////////////////////////////////////////////////////////////// -// IMMUTABLE BACKEND : // -// // -// Used by mutable backend - only suports non-mutating // -// operations over items. Primarily used for reducing, // -// scanning, and indexing. Provides non-mutating iterator // -// support for both Immutable and Mutable backends. 
// -//////////////////////////////////////////////////////////////////////////////// - -fn ImmutableNumericBackend(comptime Self: type) type { - return struct { - pub usingnamespace GeneralImmutableBackend(Self); - - /////////////////////// - // PUBLIC SECTION // - /////////////////////// - - /// findFrom - returns first index after a given position of scalar, slice, or any - pub fn findFrom( - self: Self, - comptime mode: FluentMode, - start_index: usize, - needle: Parameter(Self.DataType, mode), - ) ?usize { - return switch (mode) { - .any => std.mem.indexOfAnyPos(Self.DataType, self.items, start_index, needle), - .scalar => std.mem.indexOfScalarPos(Self.DataType, self.items, start_index, needle), - .sequence => std.mem.indexOfPos(Self.DataType, self.items, start_index, needle), - }; - } - - /// containsFrom - check if contains a given scalar, sequence, or any after a given index - pub fn containsFrom( - self: Self, - comptime mode: FluentMode, - start_index: usize, - needle: Parameter(Self.DataType, mode), - ) bool { - return findFrom(self, mode, start_index, needle) != null; - } - /// find - returns first index of scalar, slice, or any - pub fn find( - self: Self, - comptime mode: FluentMode, - needle: Parameter(Self.DataType, mode), - ) ?usize { - return findFrom(self, mode, 0, needle); - } - - /// contains - check if contains a given scalar, sequence, or any - pub fn contains( + pub fn iterator( self: Self, - comptime mode: FluentMode, - needle: Parameter(Self.DataType, mode), - ) bool { - return find(self, mode, needle) != null; + comptime mode: IteratorMode, + ) BaseIterator(DataType, mode) { + return Fluent.iterator(mode, self.items); } /// startsWith - checks if the acquired slice starts with a scalar, sequence, or any @@ -772,222 +327,32 @@ fn ImmutableNumericBackend(comptime Self: type) type { }; } - /// count - counts all, left, right given a scalar, sequence, or any - pub fn count( - self: Self, - direction: DirectionOption, - comptime mode: 
FluentMode, - needle: Parameter(Self.DataType, mode), - ) usize { - if (self.items.len == 0) return 0; + pub fn sort(self: Self, comptime direction: SortDirection) Self { + const func = if (direction == .asc) + std.sort.asc(Self.DataType) + else + std.sort.desc(Self.DataType); - return switch (direction) { - .all => countAll(self, mode, needle), - .left => countLeft(self, mode, needle), - .right => countRight(self, mode, needle), - }; + std.sort.pdq(Self.DataType, self.items, void{}, func); + return self; } - /// trim - trims left, right, or all based on any, sequence, or scalar - pub fn trim( - self: Self, - comptime direction: DirectionOption, - comptime option: TrimOptions, - comptime needle: Parameter(Self.DataType, option), - ) Self { - if (self.items.len == 0) return self; - return switch (direction) { - .left => .{ .items = self.items[trimLeft(self, option, needle)..] }, - .right => .{ .items = self.items[0..trimRight(self, option, needle)] }, - .all => self.trim(.left, option, needle).trim(.right, option, needle), - }; + /// fill - fills the acquired slice with a scalar value + pub fn fill(self: Self, scalar: Self.DataType) Self { + @memset(self.items, scalar); + return self; } - /////////////////////////////////////////////////// - // Iterator support /////////////////////////////// - - pub fn split( - self: Self, - comptime mode: std.mem.DelimiterType, - delimiter: Parameter(Self.DataType, mode), - ) std.mem.SplitIterator(Self.DataType, mode) { - return .{ .index = 0, .buffer = self.items, .delimiter = delimiter }; + /// copy - copy a given slice into the acquired slice + pub fn copy(self: Self, items: []const Self.DataType) Self { + std.debug.assert(self.items.len >= items.len); + @memcpy(self.items[0..items.len], items); + return .{ .items = self.items[0..items.len] }; } - /////////////////////// - // PRIVATE SECTION // - /////////////////////// - - fn trimLeft( - self: Self, - comptime opt: TrimOptions, - actor: Parameter(Self.DataType, opt), - ) usize 
{ - var start: usize = 0; - const end: usize = self.items.len; - switch (opt) { - .scalar => { - while (start < end and self.items[start] == actor) start += 1; - }, - .predicate => { - while (start < end and actor(self.items[start])) start += 1; - }, - .any => { - while (start < end and std.mem.indexOfScalar(Self.DataType, actor, self.items[start]) != null) start += 1; - }, - } - return start; - } - - fn trimRight( - self: Self, - comptime opt: TrimOptions, - actor: Parameter(Self.DataType, opt), - ) usize { - const start: usize = 0; - var end: usize = self.items.len; - switch (opt) { - .scalar => { - while (end > start and self.items[end - 1] == actor) end -= 1; - }, - .predicate => { - while (end > start and actor(self.items[end - 1])) end -= 1; - }, - .any => { - while (start < end and std.mem.indexOfScalar(Self.DataType, actor, self.items[end - 1]) != null) end -= 1; - }, - } - return end; - } - - fn countAll( - self: Self, - comptime mode: FluentMode, - needle: Parameter(Self.DataType, mode), - ) usize { - var result: usize = 0; - - switch (mode) { - .scalar => { - for (self.items) |it| { - if (it == needle) result += 1; - } - }, - .sequence => result = std.mem.count(Self.DataType, self.items, needle), - .any => { - for (self.items) |it| { - for (needle) |n| { - if (it == n) result += 1; - } - } - }, - } - return (result); - } - - fn countLeft( - self: Self, - comptime mode: FluentMode, - needle: Parameter(Self.DataType, mode), - ) usize { - var result: usize = 0; - switch (mode) { - .scalar => { - for (self.items, 0..) 
|it, i| { - if (it != needle) return (i); - } - }, - .sequence => { - var win_iter = std.mem.window(Self.DataType, self.items, needle.len, needle.len); - while (win_iter.next()) |win| : (result += 1) { - if (std.mem.eql(Self.DataType, win, needle) == false) break; - } - }, - .any => { - for (self.items) |it| { - if (std.mem.containsAtLeast(Self.DataType, needle, 1, &[_]Self.DataType{it}) == false) break; - result += 1; - } - }, - } - return (result); - } - - fn countRight( - self: Self, - comptime mode: FluentMode, - needle: Parameter(Self.DataType, mode), - ) usize { - var result: usize = 0; - switch (mode) { - .scalar => { - var itr = Fluent.iterator(.reverse, self.items); - while (itr.next()) |item| : (result += 1) { - if (item != needle) break; - } - }, - .sequence => { - if (self.items.len < needle.len) return 0; - var start = self.items.len - needle.len; - while (start != 0) : (start -|= needle.len) { - const win = self.items[start .. start + needle.len]; - if (!std.mem.eql(Self.DataType, win, needle)) break; - result += 1; - } - }, - .any => { - var itr = Fluent.iterator(.reverse, self.items); - while (itr.next()) |item| : (result += 1) { - if (!std.mem.containsAtLeast(Self.DataType, needle, 1, &[_]Self.DataType{item})) break; - } - }, - } - return result; - } - }; -} - -//////////////////////////////////////////////////////////////////////////////// -// MUTABLE BACKEND // -// // -// Only suports mutating operations on items. // -// Operations include sorting, replacing, // -// permutations, and partitioning. 
// -//////////////////////////////////////////////////////////////////////////////// - -pub fn GeneralMutableBackend(comptime Self: type) type { - return struct { - - // includes operations like reduce, find, and iterators - pub usingnamespace GeneralImmutableBackend(Self); - - /// sort - sorts the range in ascending or descending order - pub fn sort(self: Self, comptime direction: SortDirection) Self { - const func = if (direction == .asc) - std.sort.asc(Self.DataType) - else - std.sort.desc(Self.DataType); - - std.sort.pdq(Self.DataType, self.items, void{}, func); - return self; - } - - /// fill - fills the acquired slice with a scalar value - pub fn fill(self: Self, scalar: Self.DataType) Self { - @memset(self.items, scalar); - return self; - } - - /// copy - copy a given slice into the acquired slice - pub fn copy(self: Self, items: []const Self.DataType) Self { - std.debug.assert(self.items.len >= items.len); - @memcpy(self.items[0..items.len], items); - return .{ .items = self.items[0..items.len] }; - } - - /// rotate - rotates the array by both negative and positive amounts - pub fn rotate(self: Self, amount: anytype) Self { - const len = self.items.len; + /// rotate - rotates the array by both negative and positive amounts + pub fn rotate(self: Self, amount: anytype) Self { + const len = self.items.len; const rot_amt: usize = blk: { if (amount > 0) { @@ -1030,32 +395,104 @@ pub fn GeneralMutableBackend(comptime Self: type) type { random.shuffle(Self.DataType, self.items); return self; } - }; -} -fn MutableNumericBackend(comptime Self: type) type { - return struct { - pub usingnamespace ImmutableNumericBackend(Self); + /// lower - transform all alphabetic characters to lower case + pub fn lower(self: Self) Self { + for (self.items) |*c| c.* = std.ascii.toLower(c.*); + return self; + } - pub usingnamespace GeneralMutableBackend(Self); - }; -} + /// upper - transform all alphabetic characters to upper case + pub fn upper(self: Self) Self { + for (self.items) 
|*c| c.* = std.ascii.toUpper(c.*); + return self; + } -//////////////////////////////////////////////////////////////////////////////// -// IMMUTABLE BACKEND : // -// // -// Only activated if the child data type is u8 // -//////////////////////////////////////////////////////////////////////////////// + /// replaces values in string with a provided string. + /// Panics if the replacement string size is larger than + /// the minimum number of possible matches. + pub fn replace( + self: Self, + comptime mode: StringMode, + comptime needle: Parameter(u8, mode), + replacement: Parameter(u8, mode), + ) Self { + if (self.items.len == 0) return self; + + switch (mode) { + .scalar => { + std.mem.replaceScalar(u8, self.items, needle, replacement); + return self; + }, + .regex => { + const tree = comptime ParseRegexTree(needle); + const min_matches = comptime tree.minMatches(); -const StringMode = enum { regex, scalar }; + if (comptime min_matches == 0) + @compileError("replacment matches must be greater than zero"); -fn ImmutableStringBackend(comptime Self: type) type { - return struct { - pub usingnamespace GeneralImmutableBackend(Self); + std.debug.assert(replacement.len <= min_matches); - /////////////////////// - // PUBLIC SECTION // - /////////////////////// + var r: usize = 0; // read + var w: usize = 0; // write + + while (r < self.items.len) { + if (tree.call(self.items, r, false)) |match_end| { + if (r == match_end) { + r += 1; + continue; + } + // copy from where the write head starts + @memcpy(self.items[w..][0..replacement.len], replacement); + // advance write head by size of replacement + w += replacement.len; + // start next read at last match's end + r = match_end; + + continue; + } + + self.items[w] = self.items[r]; + r += 1; + w += 1; + } + + return .{ .items = self.items[0..w] }; + }, + } + } + + /// capitalize - transform first character to upper case and rest to lower case + pub fn capitalize(self: Self) Self { + if (self.items.len > 0) + self.items[0] 
= std.ascii.toUpper(self.items[0]); + if (self.items.len > 1) + for (self.items[1..]) |*c| { + c.* = std.ascii.toLower(c.*); + }; + return self; + } + + /// title - capitalize each sequence separated by spaces + pub fn title(self: Self) Self { + var i: usize = 0; + var prev: u8 = ' '; + while (i < self.items.len) : (i += 1) { + switch (self.items[i]) { + 'A'...'Z' => { + if (!std.ascii.isWhitespace(prev)) + self.items[i] += 32; + }, + 'a'...'z' => { + if (std.ascii.isWhitespace(prev)) + self.items[i] -= 32; + }, + else => {}, + } + prev = self.items[i]; + } + return self; + } /// isDigit - returns true for [0-9]+ pub fn isDigit(self: Self) bool { @@ -1103,26 +540,26 @@ fn ImmutableStringBackend(comptime Self: type) type { } /// digit - parses the string as an integer in base 10 - pub fn digit(self: Self, comptime T: type) !T { - if (comptime !isInteger(T)) + pub fn digit(self: Self, comptime K: type) !K { + if (comptime !isInteger(K)) @compileError("digit: requires integer type."); - return std.fmt.parseInt(T, self.items, 10); + return std.fmt.parseInt(K, self.items, 10); } /// float - parses the string as a floating-point number - pub fn float(self: Self, comptime T: type) !T { - if (comptime !isFloat(T)) + pub fn float(self: Self, comptime K: type) !K { + if (comptime !isFloat(K)) @compileError("float: requires floating-point type."); - return std.fmt.parseFloat(T, self.items); + return std.fmt.parseFloat(K, self.items); } /// float - parses the string as a floating-point number - pub fn cast(self: Self, comptime T: type) !T { - return switch (@typeInfo(T)) { - .int => self.digit(T), - .float => self.float(T), + pub fn cast(self: Self, comptime K: type) !K { + return switch (@typeInfo(K)) { + .int => self.digit(K), + .float => self.float(K), else => @compileError("cast: requires floating point or integer types."), }; } @@ -1419,1396 +856,6 @@ fn ImmutableStringBackend(comptime Self: type) type { }; } 
-//////////////////////////////////////////////////////////////////////////////// -// MUTABLE BACKEND : // -// // -// Only activated if the child data type is u8 // -//////////////////////////////////////////////////////////////////////////////// - -fn MutableStringBackend(comptime Self: type) type { - return struct { - - /////////////////////// - // PUBLIC SECTION // - /////////////////////// - - pub usingnamespace ImmutableStringBackend(Self); - - pub usingnamespace GeneralMutableBackend(Self); - - /// lower - transform all alphabetic characters to lower case - pub fn lower(self: Self) Self { - for (self.items) |*c| c.* = std.ascii.toLower(c.*); - return self; - } - - /// upper - transform all alphabetic characters to upper case - pub fn upper(self: Self) Self { - for (self.items) |*c| c.* = std.ascii.toUpper(c.*); - return self; - } - - /// replaces values in string with a provided string. - /// Panics if the replacement string size is larger than - /// the minimum number of possible matches. 
- pub fn replace( - self: Self, - comptime mode: StringMode, - comptime needle: Parameter(u8, mode), - replacement: Parameter(u8, mode), - ) Self { - if (self.items.len == 0) return self; - - switch (mode) { - .scalar => { - std.mem.replaceScalar(u8, self.items, needle, replacement); - return self; - }, - .regex => { - const tree = comptime ParseRegexTree(needle); - const min_matches = comptime tree.minMatches(); - - if (comptime min_matches == 0) - @compileError("replacment matches must be greater than zero"); - - std.debug.assert(replacement.len <= min_matches); - - var r: usize = 0; // read - var w: usize = 0; // write - - while (r < self.items.len) { - if (tree.call(self.items, r, false)) |match_end| { - if (r == match_end) { - r += 1; - continue; - } - // copy from where the write head starts - @memcpy(self.items[w..][0..replacement.len], replacement); - // advance write head by size of replacement - w += replacement.len; - // start next read at last match's end - r = match_end; - - continue; - } - - self.items[w] = self.items[r]; - r += 1; - w += 1; - } - - return .{ .items = self.items[0..w] }; - }, - } - } - - /// capitalize - transform first character to upper case and rest to lower case - pub fn capitalize(self: Self) Self { - if (self.items.len > 0) - self.items[0] = std.ascii.toUpper(self.items[0]); - if (self.items.len > 1) - for (self.items[1..]) |*c| { - c.* = std.ascii.toLower(c.*); - }; - return self; - } - - /// title - capitalize each sequence separated by spaces - pub fn title(self: Self) Self { - var i: usize = 0; - var prev: u8 = ' '; - while (i < self.items.len) : (i += 1) { - switch (self.items[i]) { - 'A'...'Z' => { - if (!std.ascii.isWhitespace(prev)) - self.items[i] += 32; - }, - 'a'...'z' => { - if (std.ascii.isWhitespace(prev)) - self.items[i] -= 32; - }, - else => {}, - } - prev = self.items[i]; - } - return self; - } - - /////////////////////// - // PRIVATE SECTION // - /////////////////////// - - }; -} - 
-////////////////////////////////////////////////////////////////////////////////// -// STRING BIT SET : // -////////////////////////////////////////////////////////////////////////////////// - -const StringBitSet = struct { - const BackingSet = std.StaticBitSet(@bitSizeOf(usize)); - - bits: [4]BackingSet, - - /// init - returns an initEmpty instance of StringBitSet - pub fn init() StringBitSet { - return .{ .bits = .{ - BackingSet.initEmpty(), - BackingSet.initEmpty(), - BackingSet.initEmpty(), - BackingSet.initEmpty(), - } }; - } - - /// setValue - sets the value of the bit at the specified position - pub fn setValue(self: *StringBitSet, pos: usize, value: bool) void { - const mod_pos = pos & 63; - switch (pos) { - 0...63 => self.bits[0].setValue(mod_pos, value), - 64...127 => self.bits[1].setValue(mod_pos, value), - 128...191 => self.bits[2].setValue(mod_pos, value), - 192...255 => self.bits[3].setValue(mod_pos, value), - else => unreachable, - } - } - - /// isSet - checks if the bit at the specified position is set - pub fn isSet(self: *const StringBitSet, pos: usize) bool { - const mod_pos = pos & 63; - return switch (pos) { - 0...63 => self.bits[0].isSet(mod_pos), - 64...127 => self.bits[1].isSet(mod_pos), - 128...191 => self.bits[2].isSet(mod_pos), - 192...255 => self.bits[3].isSet(mod_pos), - else => unreachable, - }; - } - - /// unionWith - computes the union of two StringBitSets - pub fn unionWith(self: StringBitSet, other: StringBitSet) StringBitSet { - return .{ .bits = .{ - self.bits[0].unionWith(other.bits[0]), - self.bits[1].unionWith(other.bits[1]), - self.bits[2].unionWith(other.bits[2]), - self.bits[3].unionWith(other.bits[3]), - } }; - } - - /// differenceWith - computes the difference of two StringBitSets - pub fn differenceWith(self: StringBitSet, other: StringBitSet) StringBitSet { - return .{ .bits = .{ - self.bits[0].differenceWith(other.bits[0]), - self.bits[1].differenceWith(other.bits[1]), - self.bits[2].differenceWith(other.bits[2]), - 
self.bits[3].differenceWith(other.bits[3]), - } }; - } - - /// intersectWith - computes the intersection of two StringBitSets - pub fn intersectWith(self: StringBitSet, other: StringBitSet) StringBitSet { - return .{ .bits = .{ - self.bits[0].intersectWith(other.bits[0]), - self.bits[1].intersectWith(other.bits[1]), - self.bits[2].intersectWith(other.bits[2]), - self.bits[3].intersectWith(other.bits[3]), - } }; - } - - /// count - counts the number of set bits in the StringBitSet - pub fn count(self: StringBitSet) usize { - return self.bits[0].count() + self.bits[1].count() + self.bits[2].count() + self.bits[3].count(); - } - - /// fillBuffer - fills a buffer with the values of set bits in the StringBitSet - pub fn fillBuffer(self: *const StringBitSet, buffer: []u8) []u8 { - var val: usize = 0; - var pos: usize = 0; - while (val < 256) : (val += 1) { - if (self.isSet(val)) { - buffer[pos] = @intCast(val); - pos += 1; - } - } - return buffer[0..pos]; - } -}; - -////////////////////////////////////////////////////////////////////////////////// -// ENUMERATED OPTIONS : // -////////////////////////////////////////////////////////////////////////////////// - -const DirectionOption = enum { - all, - left, - right, -}; - -const ReplaceOption = enum { - first, - last, - all, - periphery, -}; - -const TrimOptions = enum { - scalar, - predicate, - any, -}; - -const SortDirection = enum { - asc, - desc, -}; - -const SampleOption = enum { - scalar, - sequence, -}; - -// any, sequence, scalar -const FluentMode = std.mem.DelimiterType; - -//////////////////////////////////////////////////////////////////////////////// -// PRIVATE HELPERS : // -//////////////////////////////////////////////////////////////////////////////// - -fn isConst(comptime T: type) bool { - switch (@typeInfo(T)) { - .pointer => |ptr| return ptr.is_const, - else => @compileError("Type must coercible to a slice."), - } -} - -fn isSlice(comptime T: type) bool { - return switch (@typeInfo(T)) { - .pointer => 
|ptr| ptr.size == .Slice, - else => false, - }; -} - -fn isInteger(comptime T: type) bool { - return switch (@typeInfo(T)) { - .int, .comptime_int => true, - else => false, - }; -} - -fn isUnsigned(comptime T: type) bool { - return switch (@typeInfo(T)) { - .int => |i| i.signedness == .unsigned, - else => false, - }; -} - -fn tupleSize(comptime tuple: anytype) usize { - return switch (@typeInfo(@TypeOf(tuple))) { - .@"struct" => |s| s.fields.len, - else => @compileError("type must be a tuple"), - }; -} - -fn default(comptime T: type) T { - if (comptime T == bool) { - return true; - } - return 0; -} - -// bypasses iterator transform -inline fn identity(x: anytype) @TypeOf(x) { - return x; -} - -fn isFloat(comptime T: type) bool { - return switch (@typeInfo(T)) { - .float => true, - else => false, - }; -} - -fn Parameter(comptime T: type, comptime mode: anytype) type { - const param_types = std.StaticStringMap(type).initComptime(.{ - .{ "any", []const T }, - .{ "scalar", T }, - .{ "sequence", []const T }, - .{ "range", struct { start: usize, end: usize } }, - .{ "predicate", fn (T) bool }, - .{ "regex", []const u8 }, - }); - return comptime param_types.get(@tagName(mode)) orelse unreachable; -} - -// checks if we are pointing to an array -fn DeepChild(comptime T: type) type { - // TODO: consider comptime support, should be Immutable only.. 
- - const C = Child(T); - - return switch (@typeInfo(C)) { - .int, .float => C, - .array => |a| a.child, - else => @compileError("Unsupported Type"), - }; -} - -inline fn wrapIndex(len: usize, idx: anytype) usize { - switch (@typeInfo(@TypeOf(idx))) { - .int => |i| { - if (comptime i.signedness == .unsigned) { - return idx; - } else { - const u: usize = @abs(idx); - return if (idx < 0) len - u else u; - } - }, - .comptime_int => { - const u: usize = comptime @abs(idx); - return if (comptime idx < 0) len - u else u; - }, - else => @compileError("Index must be an integer type parameter."), - } -} - -inline fn reduceInit(comptime op: ReduceOp, comptime T: type) T { - const info = @typeInfo(T); - - return switch (op) { - .Add => 0, // implicit cast - .Mul => 1, // implicit cast - .Min => if (comptime info == .int) - math.maxInt(T) - else - math.inf(T), - .Max => if (comptime info == .int) - math.minInt(T) - else - -math.inf(T), - else => @compileError("reduceInit: unsupported op"), - }; -} - -fn simdReduce( - comptime T: type, - comptime ReduceType: anytype, - comptime BinaryFunc: anytype, - items: []const T, - initial: T, -) T { - // Special thanks to the user "nyc" over at the Ziggit forum - - var ptr: [*c]const T = @ptrCast(items.ptr); - const end: [*c]const T = ptr + items.len; - - var rdx: T = blk: { - if (comptime std.simd.suggestVectorLength(T)) |N| { - if (items.len < N) - break :blk initial; - - var vec_rdx: @Vector(N, T) = @splat(initial); - - while (ptr + N <= end) : (ptr += N) { - vec_rdx = @call(.always_inline, BinaryFunc, .{ vec_rdx, @as(*const @Vector(N, T), @ptrCast(@alignCast(ptr))).* }); - } - break :blk @reduce(ReduceType, vec_rdx); - } else { - break :blk initial; - } - }; - - while (ptr < end) : (ptr += 1) { - rdx = @call(.always_inline, BinaryFunc, .{ rdx, ptr.* }); - } - - return rdx; -} - -// these work for @Vector as well as scalar types -pub inline fn max(x: anytype, y: anytype) @TypeOf(x, y) { - return @max(x, y); -} -pub inline fn min(x: 
anytype, y: anytype) @TypeOf(x, y) { - return @min(x, y); -} -pub inline fn add(x: anytype, y: anytype) @TypeOf(x, y) { - return x + y; -} -pub inline fn sub(x: anytype, y: anytype) @TypeOf(x, y) { - return x - y; -} -pub inline fn div(x: anytype, y: anytype) @TypeOf(x, y) { - return x / y; -} -pub inline fn mul(x: anytype, y: anytype) @TypeOf(x, y) { - return x * y; -} -pub inline fn negate(x: anytype) @TypeOf(x) { - return -x; -} - -///////////////////////////////////////////////// -// REGEX // -///////////////////////////////////////////////// - -const RegexQuantifier = union(enum) { - any: void, // * - one_or_more: void, // + - optional: void, // ? - exact: usize, // {n} - between: struct { start: usize, stop: usize }, // {i,j} -}; - -const RegexEscaped = struct { - escaped: bool, - char: u8, -}; - -const RegexCharacter = struct { - in_square: bool, // are we a regex char set? - escaped: bool, - negated: bool, - char: u8, -}; - -const RegexSymbol = union(enum) { - s: RegexCharacter, - q: RegexQuantifier, -}; - -fn isRegexFilter(symbol: RegexEscaped) bool { - return symbol.escaped and switch (symbol.char) { - 'w', 'W', 's', 'S', 'd', 'D', '.' 
=> true, - else => false, - }; -} - -fn isRegexQuantifier(symbol: RegexEscaped) bool { - return !symbol.escaped and switch (symbol.char) { - '+', '?', '*', '{' => true, - else => false, - }; -} - -fn isRegexBracket(symbol: RegexCharacter) bool { - return !symbol.escaped and switch (symbol.char) { - '(', ')', '[', ']' => true, - else => false, - }; -} - -fn bracketSet(comptime symbol: RegexCharacter) []const u8 { - const head: u8 = if (symbol.char == '(') '(' else '['; - const tail: u8 = if (symbol.char == '(') ')' else ']'; - return &.{ head, tail }; -} - -fn parseQuantity(comptime escaped: []const RegexEscaped) usize { - comptime var count: usize = 0; - comptime var coefficient: usize = 1; - comptime var i: usize = escaped.len; - inline while (i > 0) { - i -= 1; - - if (comptime !std.ascii.isDigit(escaped[i].char)) { - @compileError("parseQuantity: invalid char"); - } - if (comptime i == 0 and escaped[i].char == '0' and escaped.len > 1) { - @compileError("parseQuantity: head zero in integer"); - } - - const value = escaped[i].char - '0'; - count += value * coefficient; - coefficient *= 10; - } - return count; -} - -fn fuseEscapes( - comptime str: []const u8, -) []const RegexEscaped { - - // TODO: - // consider making this return a direct - // array instead a slice - we don't need - // to keep it around for runtime - - if (comptime str.len == 0) { - @compileError("fuseEscapes: cannot parse empty string"); - } - - // the symbol stack to return - comptime var es: [str.len]RegexEscaped = undefined; - - // track if last char was escape - '\' - comptime var escaped: bool = false; - - // current symbol index - comptime var idx: usize = 0; - - // TODO check for invalid escape character - //'w', 'W', 's', 'S', 'd', 'D', '.', '(', ')', '[', ']' - // '+', '?', '*', '{', '-' - for (str) |char| { - if (char == '\\' and !escaped) { - escaped = true; - continue; - } - - es[idx] = .{ .escaped = escaped, .char = char }; - - escaped = false; - - idx += 1; - } - - if (comptime 
escaped) { - @compileError("fuseEscapes: unused escape symbol"); - } - - return es[0..idx]; // don't reference at runtime -} - -fn fuseQuantifiers( - comptime es: []const RegexEscaped, -) []const RegexSymbol { - comptime { - @setEvalBranchQuota(200_000); - if (isRegexQuantifier(es[0])) { - @compileError("fuseQuantifiers: 0th symbol cannot be a quanitifier"); - } - - // the symbol stack to return - var sq: [es.len]RegexSymbol = undefined; - - // check if we are within a [] clause - var in_square: bool = false; - var square_head: usize = 0; - var square_tail: usize = 0; - var negated: bool = false; - - // current symbol index - var i: usize = 0; - var j: usize = 0; - var last_quantifier: bool = false; - - // i gets incremented at loop end - while (j < es.len) : (j += 1) { - - // implements set syntax: [abc] -> a, b, or c - if (es[j].char == '[' and !es[j].escaped and !in_square) { - square_head = j; - square_tail = closingBracketEscaped(es, "[]", j); - in_square = true; - } - - // remove set-level negation and keep indicated escapes - if (es[j].char == ']' and !es[j].escaped and in_square and j == square_tail) { - in_square = false; - negated = false; - } - - // implements negated set syntax: [^abc] -> not a, b, or c - if (es[j].char == '^' and in_square and (j -| 1) == square_head and !es[j].escaped) { - negated = true; - continue; - } - - if (!isRegexQuantifier(es[j]) or in_square) { - last_quantifier = false; - - // every bracket within an [] clause is escaped - const override_escape: bool = in_square and switch (es[j].char) { - '(', ')', '[', ']', '{', '}', '.', '^', '$' => (j != square_head and j != square_tail), - else => false, - }; - - sq[i] = .{ - .s = .{ - // we don't want square brackets to be within themselves... 
- .in_square = in_square and j != square_head and j != square_tail, - .escaped = es[j].escaped or override_escape, - .negated = negated and in_square, - .char = es[j].char, - }, - }; - } else { - if (last_quantifier) { - @compileError("fuseQuantifiers: invalid quantifier"); - } - - last_quantifier = true; - - switch (es[j].char) { - '+' => { - sq[i] = .{ .q = .{ .one_or_more = void{} } }; - }, - '*' => { - sq[i] = .{ .q = .{ .any = void{} } }; - }, - '?' => { - sq[i] = .{ .q = .{ .optional = void{} } }; - }, - '{' => { - // scan forward, find closing brace, parse digits - - j += 1; // move off opening brace - - const range_i = j; - var range_j = j; - var comma = j; - - scan: while (range_j < es.len) : (range_j += 1) { - switch (es[range_j].char) { - '}' => break :scan, - ',' => { - comma = range_j; - continue; - }, - '0'...'9' => continue, - else => @compileError("fuseQuantifiers: invalid char in range"), - } - } else { - @compileError("fuseQuantifiers: unmatched '}' char"); - } - - if (es[range_j].escaped) { - @compileError("fuseQuantifiers: invalid char in range"); - } - - // {i,j} - if (range_i < comma) { - const start: usize = parseQuantity(es[range_i..comma]); - const stop: usize = parseQuantity(es[comma + 1 .. range_j]); - - if (start >= stop) { - @compileError("fuseQuantifiers: invalid range"); - } - - sq[i] = .{ .q = .{ .between = .{ .start = start, .stop = stop } } }; - } else { - const count: usize = parseQuantity(es[range_i..range_j]); - - if (count == 0) { - @compileError("fuseQuantifiers: exact quantifier cannot be 0"); - } - - sq[i] = .{ .q = .{ .exact = count } }; - } - - j = range_j; - }, - else => {}, - } - } - - // this is all the way down here because - // certain charactes can be skipped. 
- i += 1; - } - - const freeze = sq; - - return freeze[0..i]; // don't reference at runtime - } -} - -fn closingBracket( - comptime sq: []const RegexSymbol, - comptime braces: []const u8, - comptime idx: usize, -) usize { - comptime var count: isize = @intFromBool(sq[idx].s.char == braces[0] and !sq[idx].s.escaped); - - if (comptime count == 0) { - @compileError("closingBracket: must start on opening brace"); - } - comptime var i: usize = idx + 1; - while (i < sq.len) : (i += 1) { - switch (sq[i]) { - .s => |s| { - count += @intFromBool(s.char == braces[0] and !s.escaped); - count -= @intFromBool(s.char == braces[1] and !s.escaped); - if (count == 0) return i; - }, - else => continue, - } - } - @compileError("closingBracket: no closing brace found"); -} - -fn closingBracketEscaped( - comptime es: []const RegexEscaped, - comptime braces: []const u8, - comptime idx: usize, -) usize { - comptime var count: isize = @intFromBool(es[idx].char == braces[0] and !es[idx].escaped); - - if (comptime count == 0) { - @compileError("closingBracket: must start on opening brace"); - } - comptime var i: usize = idx + 1; - while (i < es.len) : (i += 1) { - count += @intFromBool(es[i].char == braces[0] and !es[i].escaped); - count -= @intFromBool(es[i].char == braces[1] and !es[i].escaped); - if (count == 0) return i; - } - @compileError("closingBracket: no closing brace found"); -} - -fn pipeSearch( - comptime sq: []const RegexSymbol, - comptime idx: usize, -) usize { - comptime var i: usize = idx; - while (i < sq.len) : (i += 1) { - switch (sq[i]) { - .s => |s| switch (s.char) { - '|' => if (s.escaped) continue else return i, - '(' => if (s.escaped) continue else { - i = closingBracket(sq, "()", i); - }, - '[' => if (s.escaped) continue else { - i = closingBracket(sq, "[]", i); - }, - '{' => if (s.escaped) continue else { - i = closingBracket(sq, "{}", i); - }, - ')', ']', '}' => if (!s.escaped) @compileError("pipeSearch: invalid braces"), - else => continue, - }, - else => 
continue, - } - } - return i; -} - -fn RegexOR( - // used for "|" or [abc] clauses - comptime lhs: type, - comptime rhs: type, -) type { - return struct { - pub fn minMatches() usize { - if (comptime @hasDecl(rhs, "minMatches")) { - return @min(lhs.minMatches(), rhs.minMatches()); - } else { - return lhs.minMatches(); - } - } - - pub fn call(str: []const u8, i: usize, prev: bool) ?usize { - if (comptime @hasDecl(rhs, "call")) { - return lhs.call(str, i, prev) orelse rhs.call(str, i, prev); - } else { - return lhs.call(str, i, prev); - } - } - }; -} - -// implents [] syntax - optimization over OR branches... -fn RegexCharset(comptime symbols: []const RegexSymbol) type { - return struct { - const Self = @This(); - - fn SetImpl( - comptime char_len: usize, - comptime span_len: usize, - comptime func_len: usize, - ) type { - return struct { - char_set: [char_len]u8, - span_set: [span_len]u8, - func_set: [func_len]u8, - }; - } - - // to handle character-spans (ex: [a-z]), we first check - // to see if we have any spans in our character set. If - // we do not, we do a vectorized check across the whole - // set. If we do have character spans, we move those - // characters to their own list and make separate checks - // for the spans. 
- - // memoize char array for easy access - const impl = blk: { - var char_len: usize = 0; - var span_len: usize = 0; - var func_len: usize = 0; - var char_set: [symbols.len]u8 = undefined; - var span_set: [symbols.len]u8 = undefined; - var func_set: [symbols.len]u8 = undefined; - - var i: usize = 0; - - while (i < symbols.len) { - if (isCharFunction(symbols[i].s.char) and symbols[i].s.escaped) { - func_set[func_len] = symbols[i].s.char; - func_len += 1; - i += 1; - continue; - } - - const j = i + 1; - const k = j + 1; - - if (j < symbols.len and k < symbols.len and symbols[j].s.char == '-' and !symbols[j].s.escaped) { - if (symbols[i].s.char >= symbols[k].s.char) - @panic("Left side of char span must be less than right side: " ++ &[_]u8{ symbols[i].s.char, '-', symbols[k].s.char }); - - span_set[span_len + 0] = symbols[i].s.char; - span_set[span_len + 1] = symbols[k].s.char; - span_len += 2; - i += 3; - continue; - } - - char_set[char_len] = symbols[i].s.char; - char_len += 1; - i += 1; - } - - break :blk Self.SetImpl(char_len, span_len, func_len){ - .char_set = char_set[0..char_len].*, - .span_set = span_set[0..span_len].*, - .func_set = func_set[0..func_len].*, - }; - }; - - // the entire character set is negated as a set - const negated = symbols[0].s.negated; - - fn checkFunc(str: []const u8, i: usize) bool { - if (comptime Self.impl.func_set.len == 0) - return false; - - // negation must be handled as a group - do not pass - // the negation flag to the char function. 
- - inline for (0..Self.impl.func_set.len) |f| { - if (charFunction(Self.impl.func_set[f], false, str, i) != null) return true; - } - return false; - } - - fn checkChar(str: []const u8, i: usize) bool { - if (comptime Self.impl.char_set.len == 0) - return false; - - return std.mem.indexOfScalar(u8, Self.impl.char_set[0..], str[i]) != null; - } - - fn checkSpan(str: []const u8, i: usize) bool { - if (comptime Self.impl.span_set.len == 0) - return false; - - const c = str[i]; - - var n: usize = 0; - - while (n < Self.impl.span_set.len) : (n += 2) { - if (Self.impl.span_set[n] <= c and c <= Self.impl.span_set[n + 1]) - return true; - } - return false; - } - - pub fn call(str: []const u8, i: usize, _: bool) ?usize { - if (i == str.len) return null; - - // Character sets in PCRE do not respect zero-length - // matches. It looks like they always increment by 1. - // This means that things [\b]w+ will not match like - // \b\w+ like one would expect. - - if (comptime !Self.negated) { - if (checkChar(str, i)) return i + 1; - if (checkSpan(str, i)) return i + 1; - if (checkFunc(str, i)) return i + 1; - return null; - } else { - const b = checkChar(str, i) or - checkSpan(str, i) or - checkFunc(str, i); - - return if (!b) i + 1 else null; - } - } - }; -} - -fn RegexAND( - comptime lhs: type, - comptime rhs: type, -) type { - return struct { - pub fn minMatches() usize { - const matches: usize = blk: { - const q = lhs.quantifier orelse break :blk 1; - break :blk switch (q) { - .any => 0, - .exact => |n| n, - .between => |b| b.start, - .one_or_more => 1, - .optional => 0, - }; - }; - if (comptime @hasDecl(rhs, "minMatches")) { - return @max(matches + rhs.minMatches(), 1); - } else { - return @max(matches, 1); - } - } - - pub fn call(str: []const u8, i: usize, prev: bool) ?usize { - - // NOTE: - // any time an index had add assignment, - // use call(str[i..], 0) to only add the - // next N matches. Otherwise, always pass - // ass call(str, i) to accumulate. 
- - if (comptime !@hasDecl(rhs, "call")) { - if (comptime lhs.quantifier) |q| { - switch (q) { - .any => { - var idx: usize = i; - while (idx < str.len) { - idx = lhs.call(str, idx, prev) orelse break; - } - return if (prev or idx != i) idx else null; - }, - .exact => |n| { - var idx: usize = i; - for (0..n) |_| { - idx = lhs.call(str, idx, prev) orelse return null; - } - return idx; - }, - .between => |b| { - var idx: usize = i; - var count: usize = 0; - while (count < b.start and idx < str.len) : (count += 1) { - idx = lhs.call(str, idx, prev) orelse return null; - } - // idx < str.len can break above loop early - if (count < b.start) return null; - - // check if new match has occured - const new_match = (i != idx) or prev; - - while (count < b.stop and idx < str.len) : (count += 1) { - idx = lhs.call(str, idx, new_match) orelse break; - } - // idx could have advanced - check again - return if (prev or idx != i) idx else null; - }, - .one_or_more => { - var idx = lhs.call(str, i, prev) orelse return null; - - while (idx < str.len) { - idx = lhs.call(str, idx, true) orelse break; - } - return idx; - }, - .optional => { - return lhs.call(str, i, prev) orelse if (prev) i else null; - }, - } - } else { - return lhs.call(str, i, prev); - } - } - - if (comptime lhs.quantifier) |q| { - switch (q) { - .any => { - var idx: usize = i; - var last: ?usize = null; - - while (idx < str.len) { - last = rhs.call(str, idx, false) orelse last; - idx = lhs.call(str, idx, prev) orelse break; - } - return rhs.call(str, idx, i != idx) orelse last; - }, - .exact => |n| { - var idx: usize = i; - for (0..n) |_| { - idx = lhs.call(str, idx, prev) orelse return null; - } - return rhs.call(str, idx, true); - }, - .between => |b| { - var idx: usize = i; - var count: usize = 0; - while (count < b.start and idx < str.len) : (count += 1) { - idx = lhs.call(str, idx, prev) orelse return null; - } - // idx < str.len can break above loop early - if (count < b.start) return null; - - // check if 
new match has occured - const new_match = (i != idx) or prev; - - var last: ?usize = null; - while (count < b.stop and idx < str.len) : (count += 1) { - last = rhs.call(str, idx, new_match) orelse last; - idx = lhs.call(str, idx, new_match) orelse break; - } - // idx could have advanced - check again - return rhs.call(str, idx, (i != idx) or prev) orelse last; - }, - .one_or_more => { - var idx: usize = lhs.call(str, i, prev) orelse return null; - - var last: ?usize = null; - while (idx < str.len) { - // at least one match above has occured - last = rhs.call(str, idx, true) orelse last; - idx = lhs.call(str, idx, true) orelse break; - } - // at least one match above has occured - return rhs.call(str, idx, true) orelse last; - }, - .optional => { - // a match hasn't occurred so we defer to previous - const j = lhs.call(str, i, prev) orelse return rhs.call(str, i, prev); - // a match must have occured so we switch to true - return rhs.call(str, j, true) orelse rhs.call(str, i, prev); - }, - } - } else { - // a match hasn't occurred so we defer to previous - const j = lhs.call(str, i, prev) orelse return null; - // a match must have occured so we switch to true - return rhs.call(str, j, true); - } - } - }; -} - -fn RegexLookAhead( - // only used for (?=) and (?!) 
type clauses, - // should only appear in those contextes - comptime this: type, - comptime positive: bool, -) type { - return struct { - pub fn minMatches() usize { - return 0; // only precedes or follows a match - } - pub inline fn call(str: []const u8, i: usize, prev: bool) ?usize { - if (comptime @hasDecl(this, "call")) { - if (comptime positive) { - return if (this.call(str, i, prev)) |_| i else null; - } else { - return if (this.call(str, i, prev)) |_| null else i; - } - } else { - // case of empty lookahead - return if (comptime positive) i else null; - } - } - }; -} - -fn RegexUnit( - comptime Callable: anytype, - comptime Quantifier: ?RegexQuantifier, -) type { - return struct { - pub const callable = Callable; - pub const quantifier = Quantifier; - pub const info = @typeInfo(@TypeOf(callable)); - pub inline fn call(str: []const u8, i: usize, prev: bool) ?usize { - if (comptime info == .@"fn") { - // terminal function call.. - return callable(str, i); - } else { - // another parsing tree... 
- return callable.call(str, i, prev); - } - } - }; -} - -////////////////////////////////////////////////// -// Character Matching Functions // -////////////////////////////////////////////////// - -// TODO: consider moving into the charFunction call while parsing -fn equalRegex(comptime char: u8) fn ([]const u8, i: usize) ?usize { - return struct { - pub fn call(str: []const u8, i: usize) ?usize { - return if (i < str.len and str[i] == char) i + 1 else null; - } - }.call; -} - -// TODO: consider moving into the charFunction call while parsing -fn startsWithRegex(str: []const u8, i: usize) ?usize { - return if (str.len > 0 and i == 0) i else null; -} - -// TODO: consider moving into the charFunction call while parsing -fn endsWithRegex(str: []const u8, i: usize) ?usize { - return if (str.len > 0 and i == str.len) i else null; -} - -// TODO: consider moving into the charFunction call while parsing -fn anyRegex(_: []const u8, i: usize) ?usize { - return i + 1; -} - -fn isWordCharacter(c: u8) bool { - return (std.ascii.isAlphanumeric(c) or c == '_'); -} - -fn isVerticalWhitespace(c: u8) bool { - return switch (c) { - '\n', '\x85', std.ascii.control_code.cr, std.ascii.control_code.vt, std.ascii.control_code.ff => true, - else => false, - }; -} - -fn isHorizontalWhitespace(c: u8) bool { - return switch (c) { - ' ', '\t' => true, - else => false, - }; -} - -fn isWordBoundary(str: []const u8, i: usize) bool { - if (i == str.len) - return isWordCharacter(str[i - 1]); - - if (i == 0 and isWordCharacter(str[i])) - return true; - - if ((i + 1) == str.len and isWordCharacter(str[i])) - return true; - - // character, check boundary behind - if (isWordCharacter(str[i]) and !isWordCharacter(str[i - 1])) - return true; - - // character, check boundary behind - if (!isWordCharacter(str[i]) and isWordCharacter(str[i - 1])) - return true; - - return false; -} - -pub fn isZeroLength(comptime c: u8) bool { - return switch (c) { - 'b', 'B' => true, - else => false, - }; -} - -fn 
isCharFunction(comptime char: u8) bool { - return switch (char) { - 'w', 'W', 'd', 'D', 's', 'S', 'h', 'H', 'v', 'V', 'b', 'B' => true, - else => false, - }; -} - -fn charFunction( - comptime char: u8, - comptime negated: bool, - str: []const u8, - i: usize, -) ?usize { - const c = comptime if (negated) negateChar(char) else char; - - return blk: { - if (comptime isZeroLength(char)) { - - // Zero-length matches can have i == str.len - // and always return i as their match, hence - // "zero-length" match. - - const b: bool = switch (comptime c) { - 'b' => isWordBoundary(str, i), - 'B' => !isWordBoundary(str, i), - else => @compileError("Invalid character"), - }; - - break :blk if (b) i else null; - } else { - - // Standard matches expect i < str.len - // and advance i by 1. - - if (i == str.len) - return null; - - const b: bool = switch (comptime c) { - 'w' => isWordCharacter(str[i]), - 'W' => !isWordCharacter(str[i]), - 'd' => std.ascii.isDigit(str[i]), - 'D' => !std.ascii.isDigit(str[i]), - 's' => std.ascii.isWhitespace(str[i]), - 'S' => !std.ascii.isWhitespace(str[i]), - 'h' => isHorizontalWhitespace(str[i]), - 'H' => !isHorizontalWhitespace(str[i]), - 'v' => isVerticalWhitespace(str[i]), - 'V' => !isVerticalWhitespace(str[i]), - else => @compileError("Invalid character"), - }; - - break :blk if (b) i + 1 else null; - } - }; -} - -pub fn BindCharFunction( - comptime char: u8, - comptime negated: bool, -) fn ([]const u8, usize) callconv(.Inline) ?usize { - return struct { - pub inline fn call(str: []const u8, i: usize) ?usize { - return charFunction(char, negated, str, i); - } - }.call; -} - -pub fn negateChar(comptime c: u8) u8 { - if (std.ascii.isLower(c)) { - return std.ascii.toUpper(c); - } - return std.ascii.toLower(c); -} - -//////////////////////////////////////////// -// Tree Parsing Functions // -//////////////////////////////////////////// - -fn ParseRegexTreeBreadth(comptime sq: []const RegexSymbol) type { - comptime { - if (sq.len == 0) - return struct 
{}; // terminal node - - const pipe: usize = pipeSearch(sq, 0); - - if (pipe < sq.len) { - return RegexOR( - ParseRegexTreeBreadth(sq[0..pipe]), - ParseRegexTreeBreadth(sq[pipe + 1 ..]), - ); - } else { - return ParseRegexTreeDepth(sq); - } - } -} - -fn ParseRegexTreeDepth(comptime sq: []const RegexSymbol) type { - comptime { - if (sq.len == 0) - return struct {}; // terminal node - - var _sq = sq; // shrinking list - - // deduce function - const Node: type = switch (_sq[0]) { - .s => |s| outer: { - if (isRegexBracket(s)) { - // this branch deduces an entire sub-automaton - var closing = closingBracket(sq, bracketSet(s), 0); - - // For parsing any thing between brackets, make sure - // to put the code within this 'sub' block so the parser - // can continue beyond the end of the brackets. Otherwise, - // this will result in a segfault. - - const T: type = sub: { - if (s.char == '[') { - break :sub RegexCharset(sq[1..closing]); - } - - if (closing > 2 and s.char == '(') { - if (_sq[1] != .q or _sq[2] != .s) { - break :sub ParseRegexTreeBreadth(sq[1..closing]); - } - const t = _sq[1].q; - const u = _sq[2].s; - - if (t == .optional) { - if (u.char == '=' and !u.escaped) { // (?= - break :sub RegexLookAhead(ParseRegexTreeBreadth(sq[3..closing]), true); - } - if (u.char == '!' and !u.escaped) { // (?! 
- break :sub RegexLookAhead(ParseRegexTreeBreadth(sq[3..closing]), false); - } - } - } - - // parse everything between the brackets - break :sub ParseRegexTreeBreadth(sq[1..closing]); - }; - - // the entire automaton can be quantified - const q: ?RegexQuantifier = - if (closing + 1 >= _sq.len) null else switch (_sq[closing + 1]) { - .q => |q| inner: { - closing += 1; - break :inner q; - }, - .s => null, - }; - - _sq = _sq[closing + 1 ..]; - - break :outer RegexUnit(T, q); - } - - _sq = _sq[1..]; // pop token - - const q: ?RegexQuantifier = - if (0 == _sq.len) null else switch (_sq[0]) { - .q => |q| inner: { - _sq = _sq[1..]; // pop token - break :inner q; - }, - .s => null, - }; - - if (s.escaped and isCharFunction(s.char)) { - break :outer RegexUnit(BindCharFunction(s.char, s.negated), q); - } else { - if (!s.escaped) { - switch (s.char) { - '.' => break :outer RegexUnit(anyRegex, q), - '^' => { - if (q != null) @compileError("Symbol '^' cannot have a quantifier."); - break :outer RegexUnit(startsWithRegex, null); - }, - '$' => { - if (q != null) @compileError("Symbol '$' cannot have a quantifier."); - break :outer RegexUnit(endsWithRegex, null); - }, - else => {}, - } - } - } - - // default to direct equals - break :outer RegexUnit(equalRegex(s.char), q); - }, - .q => @compileError("ParseRegexTreeRecursive: head quantifier"), - }; - - return RegexAND(Node, ParseRegexTreeDepth(_sq)); - } -} - -fn ParseRegexTree( - comptime expression: []const u8, -) type { - return comptime ParseRegexTreeBreadth(fuseQuantifiers(fuseEscapes(expression))); -} - //////////////////////////////////////////////////////////////////////////////// // TESTING BLOCK : /// //////////////////////////////////////////////////////////////////////////////// diff --git a/src/fluent_enum.zig b/src/fluent_enum.zig new file mode 100644 index 0000000..c9e7260 --- /dev/null +++ b/src/fluent_enum.zig @@ -0,0 +1,39 @@ +const std = @import("std"); + +pub const StringMode = enum { + regex, + scalar, +}; + 
+pub const DirectionOption = enum { + all, + left, + right, +}; + +pub const ReplaceOption = enum { + first, + last, + all, + periphery, +}; + +pub const TrimOptions = enum { + scalar, + predicate, + any, + regex, +}; + +pub const SortDirection = enum { + asc, + desc, +}; + +pub const SampleOption = enum { + scalar, + sequence, +}; + +// any, sequence, scalar +pub const FluentMode = std.mem.DelimiterType; diff --git a/src/fluent_helpers.zig b/src/fluent_helpers.zig new file mode 100644 index 0000000..eee7749 --- /dev/null +++ b/src/fluent_helpers.zig @@ -0,0 +1,182 @@ +const std = @import("std"); +const Child = std.meta.Child; +const Order = std.math.Order; +const ReduceOp = std.builtin.ReduceOp; +const math = std.math; + +//////////////////////////////////////////////////////////////////////////////// +// PRIVATE HELPERS : // +//////////////////////////////////////////////////////////////////////////////// + +pub fn isConst(comptime T: type) bool { + switch (@typeInfo(T)) { + .pointer => |ptr| return ptr.is_const, + else => @compileError("Type must coercible to a slice."), + } +} + +pub fn isSlice(comptime T: type) bool { + return switch (@typeInfo(T)) { + .pointer => |ptr| ptr.size == .Slice, + else => false, + }; +} + +pub fn isInteger(comptime T: type) bool { + return switch (@typeInfo(T)) { + .int, .comptime_int => true, + else => false, + }; +} + +pub fn isUnsigned(comptime T: type) bool { + return switch (@typeInfo(T)) { + .int => |i| i.signedness == .unsigned, + else => false, + }; +} + +pub fn tupleSize(comptime tuple: anytype) usize { + return switch (@typeInfo(@TypeOf(tuple))) { + .@"struct" => |s| s.fields.len, + else => @compileError("type must be a tuple"), + }; +} + +pub fn default(comptime T: type) T { + if (comptime T == bool) { + return true; + } + return 0; +} + +// bypasses iterator transform +pub inline fn identity(x: anytype) @TypeOf(x) { + return x; +} + +pub fn isFloat(comptime T: type) bool { + return switch (@typeInfo(T)) { + .float => 
/// simdReduce - folds `items` into a single value with `BinaryFunc`, seeded by `initial`.
///
/// When `std.simd.suggestVectorLength(T)` reports a lane count `N` and at least
/// `N` items are available, full `N`-wide chunks are combined lane-wise and the
/// lanes are then collapsed with `@reduce(ReduceType, ...)`. Whatever does not
/// fill a whole chunk is folded one element at a time.
///
/// - `ReduceType`: the `std.builtin.ReduceOp` passed to `@reduce`; it should
///   describe the same operation as `BinaryFunc`.
/// - `BinaryFunc`: binary function that is called both with two `T` values and
///   with two `@Vector(N, T)` values.
/// - `initial`: folded into the result exactly once, so it does not have to be
///   the identity element of the operation.
///
/// Elements are combined in lane order rather than slice order.
/// Returns `initial` when `items` is empty.
pub fn simdReduce(
    comptime T: type,
    comptime ReduceType: anytype,
    comptime BinaryFunc: anytype,
    items: []const T,
    initial: T,
) T {
    // Special thanks to the user "nyc" over at the Ziggit forum

    var rest = items;
    var rdx: T = initial;

    if (comptime std.simd.suggestVectorLength(T)) |N| {
        if (rest.len >= N) {
            // Seed the accumulator with the first chunk instead of
            // @splat(initial): splatting would fold `initial` in once per lane.
            //
            // Chunks are loaded through an array-to-vector coercion. A slice
            // is only guaranteed to be aligned for `T`, so reinterpreting its
            // pointer as `*const @Vector(N, T)` is not valid for every slice.
            var vec_rdx: @Vector(N, T) = rest[0..N].*;
            rest = rest[N..];

            while (rest.len >= N) : (rest = rest[N..]) {
                const chunk: @Vector(N, T) = rest[0..N].*;
                vec_rdx = @call(.always_inline, BinaryFunc, .{ vec_rdx, chunk });
            }

            rdx = @call(.always_inline, BinaryFunc, .{ rdx, @reduce(ReduceType, vec_rdx) });
        }
    }

    // scalar tail (or the whole slice when no vector path was taken)
    for (rest) |item| {
        rdx = @call(.always_inline, BinaryFunc, .{ rdx, item });
    }

    return rdx;
}
isSlice = flth.isSlice; +const Parameter = flth.Parameter; +const default = flth.default; +const tupleSize = flth.tupleSize; +const wrapIndex = flth.wrapIndex; +const isInteger = flth.isInteger; +const isUnsigned = flth.isUnsigned; +const isFloat = flth.isFloat; +const simdReduce = flth.simdReduce; +const reduceInit = flth.reduceInit; +const add = flth.add; +const mul = flth.mul; + +//////////////////////////////////////////////////////////////////////////////// +// REGEX IMPORT /// +//////////////////////////////////////////////////////////////////////////////// + +const fltregx = @import("fluent_regex.zig"); +const ParseRegexTree = fltregx.ParseRegexTree; + +/// enum {forward, reverse} +pub const IteratorMode = enum { forward, reverse }; + +pub fn BaseIterator(comptime T: type, mode: IteratorMode) type { + return IteratorInterface(T, mode, void{}, identity); +} + +pub fn iterator( + comptime mode: IteratorMode, + items: anytype, +) BaseIterator(DeepChild(@TypeOf(items)), mode) { + const T = DeepChild(@TypeOf(items)); + + if (comptime !isSlice(@TypeOf(items))) { + return iterator(mode, @as([]const T, items)); + } + + const P = [*c]const T; + + const ptr: P = if (comptime mode == .forward) + @as(P, @ptrCast(items.ptr)) + else + (@as(P, @ptrCast(items.ptr)) + items.len) - 1; + + const end: P = if (comptime mode == .forward) + @as(P, @ptrCast(items.ptr)) + items.len + else + @as(P, @ptrCast(items.ptr)) - 1; + + return .{ + .ptr = ptr, + .end = end, + .stride = 1, + }; +} + +pub fn MatchIterator( + comptime expression: []const u8, +) type { + return struct { + const Self = @This(); + const tree = ParseRegexTree(expression); + items: []const u8, + index: usize, + + pub fn init(items: []const u8) Self { + return .{ .items = items, .index = 0 }; + } + + pub fn next(self: *Self) ?FluentInterface([]const u8) { + while (self.index < self.items.len) : (self.index += 1) { + if (tree.call(self.items, self.index, false)) |last| { + + // non-advancing calls + if (self.index == 
/// SplitIterator - walks a string and yields the segments that lie between
/// matches of the delimiting `expression`.
pub fn SplitIterator(comptime expression: []const u8) type {
    return struct {
        const Self = @This();
        const tree = ParseRegexTree(expression);

        items: []const u8,
        /// Start of the next segment; `null` once the final segment was returned.
        index: ?usize,

        pub fn init(items: []const u8) Self {
            return .{ .items = items, .index = 0 };
        }

        /// next - returns the next segment wrapped in a Fluent interface,
        /// or `null` when the iterator is exhausted.
        pub fn next(self: *Self) ?FluentInterface([]const u8) {
            const segment = self.span() orelse return null;
            return Fluent.init(self.items[segment.pos..segment.end]);
        }

        /// span - same traversal as `next`, but reports the segment as
        /// `items[pos..end]` bounds instead of a slice. Returns `null` when
        /// the iterator is exhausted (the return type is optional, matching
        /// `MatchIterator.span`).
        pub fn span(self: *Self) ?struct { pos: usize, end: usize } {
            const start = self.index orelse return null;
            var stop: usize = start;

            // Scan forward for the next delimiter match; `stop` ends up at the
            // first byte of the delimiter, or at items.len when none is left.
            const resume_at: ?usize = while (stop < self.items.len) : (stop += 1) {
                const last = tree.call(self.items, stop, false) orelse continue;

                // non-advancing calls
                if (start == last)
                    continue;

                break last;
            } else null;

            // the next segment starts right after the delimiter (null = done)
            self.index = resume_at;
            return .{ .pos = start, .end = stop };
        }
    };
}
expression: []const u8, + source: []const u8, +) SplitIterator(expression) { + return SplitIterator(expression).init(source); +} + +pub fn IteratorInterface( + comptime DataType: type, + mode: IteratorMode, + comptime filters: anytype, // tuple or function + comptime transforms: anytype, // tuple or function +) type { + return struct { + const Self = @This(); + const Mode = mode; + + ptr: [*c]const DataType, + end: [*c]const DataType, + stride: usize, + + pub fn next(self: *Self) ?DataType { + if (comptime @TypeOf(filters) != void) { + // apply single filter or tuple of filters + switch (comptime @typeInfo(@TypeOf(filters))) { + .@"fn" => { + if (comptime Mode == .forward) { + while (self.ptr < self.end and !filters(self.ptr.*)) + self.ptr += self.stride; + } else { + while (self.ptr > self.end and !filters(self.ptr.*)) + self.ptr -= self.stride; + } + }, + else => outer: { // applies inline filters + if (comptime Mode == .forward) { + inner: while (self.ptr < self.end) : (self.ptr += self.stride) { + inline for (filters) |f| { + if (!f(self.ptr.*)) continue :inner; + } + break :outer; + } + } else { + inner: while (self.ptr > self.end) : (self.ptr -= self.stride) { + inline for (filters) |f| { + if (!f(self.ptr.*)) continue :inner; + } + break :outer; + } + } + }, + } + } + + // unpack transforms into single transform call + const transform = comptime if (@typeInfo(@TypeOf(transforms)) == .@"fn") + transforms + else + Fluent.Chain(transforms).call; + + switch (comptime Mode) { + .forward => { + if (self.ptr < self.end) { + defer self.ptr += self.stride; + return @call(.always_inline, transform, .{self.ptr.*}); + } + }, + .reverse => { + if (self.ptr > self.end) { + defer self.ptr -= self.stride; + return @call(.always_inline, transform, .{self.ptr.*}); + } + }, + } + return null; + } + + /// strided - set iterator stride (default 1) + pub fn strided( + self: Self, + stride_size: usize, + ) Self { + return .{ + .ptr = self.ptr, + .end = self.end, + .stride = 
stride_size, + }; + } + + /// window - return a slice and advance by stride + pub fn window( + self: *Self, + window_size: usize, + ) ?FluentInterface([]const DataType) { + switch (comptime Mode) { + .forward => { + if (self.ptr + window_size <= self.end) { + defer _ = self.next(); + return Fluent.init(self.ptr[0..window_size]); + } + }, + .reverse => { + if ((self.ptr + 1) - window_size > self.end) { + defer _ = self.next(); + return Fluent.init(((self.ptr + 1) - window_size)[0..window_size]); + } + }, + } + return null; + } + + /// map - transforms every elment in the acquired slice with a given unary function + pub fn map( + self: Self, + comptime new_transforms: anytype, + ) IteratorInterface(DataType, Mode, filters, new_transforms) { + return .{ + .ptr = self.ptr, + .end = self.end, + .stride = self.stride, + }; + } + + /// filter - acquire a unary predicate or a tuple of unary predicates + pub fn filter( + self: Self, + comptime new_filters: anytype, + ) IteratorInterface(DataType, Mode, new_filters, transforms) { + return .{ + .ptr = self.ptr, + .end = self.end, + .stride = self.stride, + }; + } + + pub fn write( + self: anytype, // for both const and non-const pointers + items: []DataType, + ) usize { + // enable chaining without temporaries + if (comptime isConst(@TypeOf(self))) { + var tmp = self.*; + return tmp.write(items); + } + var count: usize = 0; + while (count < items.len) : (count += 1) { + items[count] = self.next() orelse return count; + } + return count; + } + + pub fn reduce( + self: anytype, // for both const and non-const pointers + comptime T: type, + comptime binary_func: anytype, // single binary function + initial: T, + ) T { + // enable chaining without temporaries + if (comptime isConst(@TypeOf(self))) { + var tmp = self.*; + return tmp.reduce(T, binary_func, initial); + } + var rdx = initial; + while (self.next()) |x| { + rdx = @call(.always_inline, binary_func, .{ rdx, x }); + } + return rdx; + } + }; +} diff --git 
a/src/fluent_regex.zig b/src/fluent_regex.zig new file mode 100644 index 0000000..9e10d6c --- /dev/null +++ b/src/fluent_regex.zig @@ -0,0 +1,990 @@ +const std = @import("std"); +const Child = std.meta.Child; +const Order = std.math.Order; +const ReduceOp = std.builtin.ReduceOp; +const math = std.math; + +//////////////////////////////////////////////////////////////////////////////// +// HELPERS IMPORT /// +//////////////////////////////////////////////////////////////////////////////// + +const flth = @import("fluent_helpers.zig"); + +const DeepChild = flth.DeepChild; +const isConst = flth.isConst; +const identity = flth.identity; +const isSlice = flth.isSlice; +const Parameter = flth.Parameter; +const default = flth.default; +const tupleSize = flth.tupleSize; +const wrapIndex = flth.wrapIndex; +const isInteger = flth.isInteger; +const isUnsigned = flth.isUnsigned; +const isFloat = flth.isFloat; +const simdReduce = flth.simdReduce; +const reduceInit = flth.reduceInit; +const add = flth.add; +const mul = flth.mul; + +///////////////////////////////////////////////// +// REGEX // +///////////////////////////////////////////////// + +pub const RegexQuantifier = union(enum) { + any: void, // * + one_or_more: void, // + + optional: void, // ? + exact: usize, // {n} + between: struct { start: usize, stop: usize }, // {i,j} +}; + +pub const RegexEscaped = struct { + escaped: bool, + char: u8, +}; + +pub const RegexCharacter = struct { + in_square: bool, // are we a regex char set? + escaped: bool, + negated: bool, + char: u8, +}; + +pub const RegexSymbol = union(enum) { + s: RegexCharacter, + q: RegexQuantifier, +}; + +pub fn isRegexFilter(symbol: RegexEscaped) bool { + return symbol.escaped and switch (symbol.char) { + 'w', 'W', 's', 'S', 'd', 'D', '.' 
=> true, + else => false, + }; +} + +pub fn isRegexQuantifier(symbol: RegexEscaped) bool { + return !symbol.escaped and switch (symbol.char) { + '+', '?', '*', '{' => true, + else => false, + }; +} + +pub fn isRegexBracket(symbol: RegexCharacter) bool { + return !symbol.escaped and switch (symbol.char) { + '(', ')', '[', ']' => true, + else => false, + }; +} + +pub fn bracketSet(comptime symbol: RegexCharacter) []const u8 { + const head: u8 = if (symbol.char == '(') '(' else '['; + const tail: u8 = if (symbol.char == '(') ')' else ']'; + return &.{ head, tail }; +} + +pub fn parseQuantity(comptime escaped: []const RegexEscaped) usize { + comptime var count: usize = 0; + comptime var coefficient: usize = 1; + comptime var i: usize = escaped.len; + inline while (i > 0) { + i -= 1; + + if (comptime !std.ascii.isDigit(escaped[i].char)) { + @compileError("parseQuantity: invalid char"); + } + if (comptime i == 0 and escaped[i].char == '0' and escaped.len > 1) { + @compileError("parseQuantity: head zero in integer"); + } + + const value = escaped[i].char - '0'; + count += value * coefficient; + coefficient *= 10; + } + return count; +} + +pub fn fuseEscapes( + comptime str: []const u8, +) []const RegexEscaped { + + // TODO: + // consider making this return a direct + // array instead a slice - we don't need + // to keep it around for runtime + + if (comptime str.len == 0) { + @compileError("fuseEscapes: cannot parse empty string"); + } + + // the symbol stack to return + comptime var es: [str.len]RegexEscaped = undefined; + + // track if last char was escape - '\' + comptime var escaped: bool = false; + + // current symbol index + comptime var idx: usize = 0; + + // TODO check for invalid escape character + //'w', 'W', 's', 'S', 'd', 'D', '.', '(', ')', '[', ']' + // '+', '?', '*', '{', '-' + for (str) |char| { + if (char == '\\' and !escaped) { + escaped = true; + continue; + } + + es[idx] = .{ .escaped = escaped, .char = char }; + + escaped = false; + + idx += 1; + } + 
+ if (comptime escaped) { + @compileError("fuseEscapes: unused escape symbol"); + } + + return es[0..idx]; // don't reference at runtime +} + +pub fn fuseQuantifiers( + comptime es: []const RegexEscaped, +) []const RegexSymbol { + comptime { + @setEvalBranchQuota(200_000); + if (isRegexQuantifier(es[0])) { + @compileError("fuseQuantifiers: 0th symbol cannot be a quanitifier"); + } + + // the symbol stack to return + var sq: [es.len]RegexSymbol = undefined; + + // check if we are within a [] clause + var in_square: bool = false; + var square_head: usize = 0; + var square_tail: usize = 0; + var negated: bool = false; + + // current symbol index + var i: usize = 0; + var j: usize = 0; + var last_quantifier: bool = false; + + // i gets incremented at loop end + while (j < es.len) : (j += 1) { + + // implements set syntax: [abc] -> a, b, or c + if (es[j].char == '[' and !es[j].escaped and !in_square) { + square_head = j; + square_tail = closingBracketEscaped(es, "[]", j); + in_square = true; + } + + // remove set-level negation and keep indicated escapes + if (es[j].char == ']' and !es[j].escaped and in_square and j == square_tail) { + in_square = false; + negated = false; + } + + // implements negated set syntax: [^abc] -> not a, b, or c + if (es[j].char == '^' and in_square and (j -| 1) == square_head and !es[j].escaped) { + negated = true; + continue; + } + + if (!isRegexQuantifier(es[j]) or in_square) { + last_quantifier = false; + + // every bracket within an [] clause is escaped + const override_escape: bool = in_square and switch (es[j].char) { + '(', ')', '[', ']', '{', '}', '.', '^', '$' => (j != square_head and j != square_tail), + else => false, + }; + + sq[i] = .{ + .s = .{ + // we don't want square brackets to be within themselves... 
+ .in_square = in_square and j != square_head and j != square_tail, + .escaped = es[j].escaped or override_escape, + .negated = negated and in_square, + .char = es[j].char, + }, + }; + } else { + if (last_quantifier) { + @compileError("fuseQuantifiers: invalid quantifier"); + } + + last_quantifier = true; + + switch (es[j].char) { + '+' => { + sq[i] = .{ .q = .{ .one_or_more = void{} } }; + }, + '*' => { + sq[i] = .{ .q = .{ .any = void{} } }; + }, + '?' => { + sq[i] = .{ .q = .{ .optional = void{} } }; + }, + '{' => { + // scan forward, find closing brace, parse digits + + j += 1; // move off opening brace + + const range_i = j; + var range_j = j; + var comma = j; + + scan: while (range_j < es.len) : (range_j += 1) { + switch (es[range_j].char) { + '}' => break :scan, + ',' => { + comma = range_j; + continue; + }, + '0'...'9' => continue, + else => @compileError("fuseQuantifiers: invalid char in range"), + } + } else { + @compileError("fuseQuantifiers: unmatched '}' char"); + } + + if (es[range_j].escaped) { + @compileError("fuseQuantifiers: invalid char in range"); + } + + // {i,j} + if (range_i < comma) { + const start: usize = parseQuantity(es[range_i..comma]); + const stop: usize = parseQuantity(es[comma + 1 .. range_j]); + + if (start >= stop) { + @compileError("fuseQuantifiers: invalid range"); + } + + sq[i] = .{ .q = .{ .between = .{ .start = start, .stop = stop } } }; + } else { + const count: usize = parseQuantity(es[range_i..range_j]); + + if (count == 0) { + @compileError("fuseQuantifiers: exact quantifier cannot be 0"); + } + + sq[i] = .{ .q = .{ .exact = count } }; + } + + j = range_j; + }, + else => {}, + } + } + + // this is all the way down here because + // certain charactes can be skipped. 
+ i += 1; + } + + const freeze = sq; + + return freeze[0..i]; // don't reference at runtime + } +} + +pub fn closingBracket( + comptime sq: []const RegexSymbol, + comptime braces: []const u8, + comptime idx: usize, +) usize { + comptime var count: isize = @intFromBool(sq[idx].s.char == braces[0] and !sq[idx].s.escaped); + + if (comptime count == 0) { + @compileError("closingBracket: must start on opening brace"); + } + comptime var i: usize = idx + 1; + while (i < sq.len) : (i += 1) { + switch (sq[i]) { + .s => |s| { + count += @intFromBool(s.char == braces[0] and !s.escaped); + count -= @intFromBool(s.char == braces[1] and !s.escaped); + if (count == 0) return i; + }, + else => continue, + } + } + @compileError("closingBracket: no closing brace found"); +} + +pub fn closingBracketEscaped( + comptime es: []const RegexEscaped, + comptime braces: []const u8, + comptime idx: usize, +) usize { + comptime var count: isize = @intFromBool(es[idx].char == braces[0] and !es[idx].escaped); + + if (comptime count == 0) { + @compileError("closingBracket: must start on opening brace"); + } + comptime var i: usize = idx + 1; + while (i < es.len) : (i += 1) { + count += @intFromBool(es[i].char == braces[0] and !es[i].escaped); + count -= @intFromBool(es[i].char == braces[1] and !es[i].escaped); + if (count == 0) return i; + } + @compileError("closingBracket: no closing brace found"); +} + +pub fn pipeSearch( + comptime sq: []const RegexSymbol, + comptime idx: usize, +) usize { + comptime var i: usize = idx; + while (i < sq.len) : (i += 1) { + switch (sq[i]) { + .s => |s| switch (s.char) { + '|' => if (s.escaped) continue else return i, + '(' => if (s.escaped) continue else { + i = closingBracket(sq, "()", i); + }, + '[' => if (s.escaped) continue else { + i = closingBracket(sq, "[]", i); + }, + '{' => if (s.escaped) continue else { + i = closingBracket(sq, "{}", i); + }, + ')', ']', '}' => if (!s.escaped) @compileError("pipeSearch: invalid braces"), + else => continue, + }, + 
else => continue, + } + } + return i; +} + +pub fn RegexOR( + // used for "|" or [abc] clauses + comptime lhs: type, + comptime rhs: type, +) type { + return struct { + pub fn minMatches() usize { + if (comptime @hasDecl(rhs, "minMatches")) { + return @min(lhs.minMatches(), rhs.minMatches()); + } else { + return lhs.minMatches(); + } + } + + pub fn call(str: []const u8, i: usize, prev: bool) ?usize { + if (comptime @hasDecl(rhs, "call")) { + return lhs.call(str, i, prev) orelse rhs.call(str, i, prev); + } else { + return lhs.call(str, i, prev); + } + } + }; +} + +// implents [] syntax - optimization over OR branches... +pub fn RegexCharset(comptime symbols: []const RegexSymbol) type { + return struct { + const Self = @This(); + + fn SetImpl( + comptime char_len: usize, + comptime span_len: usize, + comptime func_len: usize, + ) type { + return struct { + char_set: [char_len]u8, + span_set: [span_len]u8, + func_set: [func_len]u8, + }; + } + + // to handle character-spans (ex: [a-z]), we first check + // to see if we have any spans in our character set. If + // we do not, we do a vectorized check across the whole + // set. If we do have character spans, we move those + // characters to their own list and make separate checks + // for the spans. 
+ + // memoize char array for easy access + const impl = blk: { + var char_len: usize = 0; + var span_len: usize = 0; + var func_len: usize = 0; + var char_set: [symbols.len]u8 = undefined; + var span_set: [symbols.len]u8 = undefined; + var func_set: [symbols.len]u8 = undefined; + + var i: usize = 0; + + while (i < symbols.len) { + if (isCharFunction(symbols[i].s.char) and symbols[i].s.escaped) { + func_set[func_len] = symbols[i].s.char; + func_len += 1; + i += 1; + continue; + } + + const j = i + 1; + const k = j + 1; + + if (j < symbols.len and k < symbols.len and symbols[j].s.char == '-' and !symbols[j].s.escaped) { + if (symbols[i].s.char >= symbols[k].s.char) + @panic("Left side of char span must be less than right side: " ++ &[_]u8{ symbols[i].s.char, '-', symbols[k].s.char }); + + span_set[span_len + 0] = symbols[i].s.char; + span_set[span_len + 1] = symbols[k].s.char; + span_len += 2; + i += 3; + continue; + } + + char_set[char_len] = symbols[i].s.char; + char_len += 1; + i += 1; + } + + break :blk Self.SetImpl(char_len, span_len, func_len){ + .char_set = char_set[0..char_len].*, + .span_set = span_set[0..span_len].*, + .func_set = func_set[0..func_len].*, + }; + }; + + // the entire character set is negated as a set + const negated = symbols[0].s.negated; + + fn checkFunc(str: []const u8, i: usize) bool { + if (comptime Self.impl.func_set.len == 0) + return false; + + // negation must be handled as a group - do not pass + // the negation flag to the char function. 
+ + inline for (0..Self.impl.func_set.len) |f| { + if (charFunction(Self.impl.func_set[f], false, str, i) != null) return true; + } + return false; + } + + fn checkChar(str: []const u8, i: usize) bool { + if (comptime Self.impl.char_set.len == 0) + return false; + + return std.mem.indexOfScalar(u8, Self.impl.char_set[0..], str[i]) != null; + } + + fn checkSpan(str: []const u8, i: usize) bool { + if (comptime Self.impl.span_set.len == 0) + return false; + + const c = str[i]; + + var n: usize = 0; + + while (n < Self.impl.span_set.len) : (n += 2) { + if (Self.impl.span_set[n] <= c and c <= Self.impl.span_set[n + 1]) + return true; + } + return false; + } + + pub fn call(str: []const u8, i: usize, _: bool) ?usize { + if (i == str.len) return null; + + // Character sets in PCRE do not respect zero-length + // matches. It looks like they always increment by 1. + // This means that things [\b]w+ will not match like + // \b\w+ like one would expect. + + if (comptime !Self.negated) { + if (checkChar(str, i)) return i + 1; + if (checkSpan(str, i)) return i + 1; + if (checkFunc(str, i)) return i + 1; + return null; + } else { + const b = checkChar(str, i) or + checkSpan(str, i) or + checkFunc(str, i); + + return if (!b) i + 1 else null; + } + } + }; +} + +pub fn RegexAND( + comptime lhs: type, + comptime rhs: type, +) type { + return struct { + pub fn minMatches() usize { + const matches: usize = blk: { + const q = lhs.quantifier orelse break :blk 1; + break :blk switch (q) { + .any => 0, + .exact => |n| n, + .between => |b| b.start, + .one_or_more => 1, + .optional => 0, + }; + }; + if (comptime @hasDecl(rhs, "minMatches")) { + return @max(matches + rhs.minMatches(), 1); + } else { + return @max(matches, 1); + } + } + + pub fn call(str: []const u8, i: usize, prev: bool) ?usize { + + // NOTE: + // any time an index had add assignment, + // use call(str[i..], 0) to only add the + // next N matches. Otherwise, always pass + // ass call(str, i) to accumulate. 
+ + if (comptime !@hasDecl(rhs, "call")) { + if (comptime lhs.quantifier) |q| { + switch (q) { + .any => { + var idx: usize = i; + while (idx < str.len) { + idx = lhs.call(str, idx, prev) orelse break; + } + return if (prev or idx != i) idx else null; + }, + .exact => |n| { + var idx: usize = i; + for (0..n) |_| { + idx = lhs.call(str, idx, prev) orelse return null; + } + return idx; + }, + .between => |b| { + var idx: usize = i; + var count: usize = 0; + while (count < b.start and idx < str.len) : (count += 1) { + idx = lhs.call(str, idx, prev) orelse return null; + } + // idx < str.len can break above loop early + if (count < b.start) return null; + + // check if new match has occured + const new_match = (i != idx) or prev; + + while (count < b.stop and idx < str.len) : (count += 1) { + idx = lhs.call(str, idx, new_match) orelse break; + } + // idx could have advanced - check again + return if (prev or idx != i) idx else null; + }, + .one_or_more => { + var idx = lhs.call(str, i, prev) orelse return null; + + while (idx < str.len) { + idx = lhs.call(str, idx, true) orelse break; + } + return idx; + }, + .optional => { + return lhs.call(str, i, prev) orelse if (prev) i else null; + }, + } + } else { + return lhs.call(str, i, prev); + } + } + + if (comptime lhs.quantifier) |q| { + switch (q) { + .any => { + var idx: usize = i; + var last: ?usize = null; + + while (idx < str.len) { + last = rhs.call(str, idx, false) orelse last; + idx = lhs.call(str, idx, prev) orelse break; + } + return rhs.call(str, idx, i != idx) orelse last; + }, + .exact => |n| { + var idx: usize = i; + for (0..n) |_| { + idx = lhs.call(str, idx, prev) orelse return null; + } + return rhs.call(str, idx, true); + }, + .between => |b| { + var idx: usize = i; + var count: usize = 0; + while (count < b.start and idx < str.len) : (count += 1) { + idx = lhs.call(str, idx, prev) orelse return null; + } + // idx < str.len can break above loop early + if (count < b.start) return null; + + // check if 
new match has occured + const new_match = (i != idx) or prev; + + var last: ?usize = null; + while (count < b.stop and idx < str.len) : (count += 1) { + last = rhs.call(str, idx, new_match) orelse last; + idx = lhs.call(str, idx, new_match) orelse break; + } + // idx could have advanced - check again + return rhs.call(str, idx, (i != idx) or prev) orelse last; + }, + .one_or_more => { + var idx: usize = lhs.call(str, i, prev) orelse return null; + + var last: ?usize = null; + while (idx < str.len) { + // at least one match above has occured + last = rhs.call(str, idx, true) orelse last; + idx = lhs.call(str, idx, true) orelse break; + } + // at least one match above has occured + return rhs.call(str, idx, true) orelse last; + }, + .optional => { + // a match hasn't occurred so we defer to previous + const j = lhs.call(str, i, prev) orelse return rhs.call(str, i, prev); + // a match must have occured so we switch to true + return rhs.call(str, j, true) orelse rhs.call(str, i, prev); + }, + } + } else { + // a match hasn't occurred so we defer to previous + const j = lhs.call(str, i, prev) orelse return null; + // a match must have occured so we switch to true + return rhs.call(str, j, true); + } + } + }; +} + +pub fn RegexLookAhead( + // only used for (?=) and (?!) 
// type clauses,
    // should only appear in those contexts
    comptime this: type,
    comptime positive: bool,
) type {
    return struct {
        /// A lookahead never consumes input, so it adds nothing to the
        /// minimum match length.
        pub fn minMatches() usize {
            return 0; // only precedes or follows a match
        }

        /// Evaluates the lookahead at `i` without advancing.
        /// Positive form: returns `i` when `this` matches at `i`, else null.
        /// Negative form: returns `i` when `this` does NOT match, else null.
        pub inline fn call(str: []const u8, i: usize, prev: bool) ?usize {
            if (comptime @hasDecl(this, "call")) {
                const matched = this.call(str, i, prev) != null;
                return if (matched == positive) i else null;
            } else {
                // case of empty lookahead: `(?=)` always holds, `(?!)` never does
                return if (comptime positive) i else null;
            }
        }
    };
}

/// Wraps either a terminal matcher function or a nested parse tree together
/// with an optional quantifier, exposing a uniform `call(str, i, prev)`.
pub fn RegexUnit(
    comptime Callable: anytype,
    comptime Quantifier: ?RegexQuantifier,
) type {
    return struct {
        pub const callable = Callable;
        pub const quantifier = Quantifier;
        pub const info = @typeInfo(@TypeOf(callable));

        /// Dispatches to the wrapped matcher. Terminal functions take only
        /// `(str, i)`; nested trees additionally receive `prev`.
        pub inline fn call(str: []const u8, i: usize, prev: bool) ?usize {
            if (comptime info == .@"fn") {
                // terminal function call..
                return callable(str, i);
            } else {
                // another parsing tree...
                return callable.call(str, i, prev);
            }
        }
    };
}

//////////////////////////////////////////////////
//        Character Matching Functions          //
//////////////////////////////////////////////////

// TODO: consider moving into the charFunction call while parsing
/// Builds a matcher for one literal byte: returns `i + 1` when `str[i] == char`,
/// null otherwise (including when `i` is at or past the end of `str`).
pub fn equalRegex(comptime char: u8) fn ([]const u8, i: usize) ?usize {
    return struct {
        pub fn call(str: []const u8, i: usize) ?usize {
            return if (i < str.len and str[i] == char) i + 1 else null;
        }
    }.call;
}

// TODO: consider moving into the charFunction call while parsing
/// Matcher for `^`: succeeds (consuming nothing) only at index 0.
/// NOTE(review): an empty `str` never matches - confirm this is intended.
pub fn startsWithRegex(str: []const u8, i: usize) ?usize {
    return if (str.len > 0 and i == 0) i else null;
}

// TODO: consider moving into the charFunction call while parsing
/// Matcher for `$`: succeeds (consuming nothing) only at `i == str.len`.
/// NOTE(review): an empty `str` never matches - confirm this is intended.
pub fn endsWithRegex(str: []const u8, i: usize) ?usize {
    return if (str.len > 0 and i == str.len) i else null;
}

// TODO: consider moving into the charFunction call while parsing
/// Matcher for `.`: consumes exactly one byte, whatever its value.
/// Returns null when there is no byte left at `i`, so the returned index
/// can never run past `str.len`.
pub fn anyRegex(str: []const u8, i: usize) ?usize {
    return if (i < str.len) i + 1 else null;
}

/// True for ASCII letters, digits and '_' (the `\w` class).
pub fn isWordCharacter(c: u8) bool {
    return (std.ascii.isAlphanumeric(c) or c == '_');
}

/// True for the bytes treated as vertical whitespace (the `\v` class):
/// LF, 0x85, CR, VT and FF.
pub fn isVerticalWhitespace(c: u8) bool {
    return switch (c) {
        '\n', '\x85', std.ascii.control_code.cr, std.ascii.control_code.vt, std.ascii.control_code.ff => true,
        else => false,
    };
}

/// True for space and tab (the `\h` class).
pub fn isHorizontalWhitespace(c: u8) bool {
    return switch (c) {
        ' ', '\t' => true,
        else => false,
    };
}

/// Reports whether position `i` sits on a word boundary (`\b`): exactly one
/// of the byte before `i` and the byte at `i` is a word character.
/// Positions outside the string count as non-word, so `i == 0` and
/// `i == str.len` are handled without reading out of bounds; an empty
/// string has no boundary. Any `i > str.len` yields false.
pub fn isWordBoundary(str: []const u8, i: usize) bool {
    if (i > str.len) return false;

    // Guard both neighbours: `i - 1` must not underflow at the start and
    // `str[i]` must not be read at the end.
    const word_before = i > 0 and isWordCharacter(str[i - 1]);
    const word_after = i < str.len and isWordCharacter(str[i]);

    return word_before != word_after;
}

/// True for escape letters that assert a position instead of consuming a byte.
pub fn isZeroLength(comptime c: u8) bool {
    return switch (c) {
        'b', 'B' => true,
        else => false,
    };
}

/// True when `\<char>` names one of the built-in character functions.
pub fn isCharFunction(comptime char: u8) bool {
    return switch (char) {
        'w', 'W', 'd', 'D', 's', 'S', 'h', 'H', 'v', 'V', 'b', 'B' => true,
        else => false,
    };
}

/// Evaluates the escape class `\<char>` at `str[i]`.
/// `negated` flips the class by swapping the letter's case (see `negateChar`).
/// Zero-length classes (`b`, `B`) return `i` on success; consuming classes
/// return `i + 1`. Returns null on mismatch or when a consuming class is
/// asked to match at or beyond the end of `str`.
pub fn charFunction(
    comptime char: u8,
    comptime negated: bool,
    str: []const u8,
    i: usize,
) ?usize {
    const c = comptime if (negated) negateChar(char) else char;

    if (comptime isZeroLength(char)) {

        // Zero-length matches can have i == str.len
        // and always return i as their match, hence
        // "zero-length" match.

        const matched: bool = switch (comptime c) {
            'b' => isWordBoundary(str, i),
            'B' => !isWordBoundary(str, i),
            else => @compileError("Invalid character"),
        };

        return if (matched) i else null;
    } else {

        // Standard matches expect i < str.len
        // and advance i by 1.

        if (i >= str.len)
            return null;

        const byte = str[i];

        const matched: bool = switch (comptime c) {
            'w' => isWordCharacter(byte),
            'W' => !isWordCharacter(byte),
            'd' => std.ascii.isDigit(byte),
            'D' => !std.ascii.isDigit(byte),
            's' => std.ascii.isWhitespace(byte),
            'S' => !std.ascii.isWhitespace(byte),
            'h' => isHorizontalWhitespace(byte),
            'H' => !isHorizontalWhitespace(byte),
            'v' => isVerticalWhitespace(byte),
            'V' => !isVerticalWhitespace(byte),
            else => @compileError("Invalid character"),
        };

        return if (matched) i + 1 else null;
    }
}

/// Fixes `char` and `negated` at comptime and returns the resulting
/// two-argument `(str, i)` matcher that forwards to `charFunction`.
pub fn BindCharFunction(
    comptime char: u8,
    comptime negated: bool,
) fn ([]const u8, usize) callconv(.Inline) ?usize {
    return struct {
        pub inline fn call(str: []const u8, i: usize) ?usize {
            return charFunction(char, negated, str, i);
        }
    }.call;
}

/// Swaps the ASCII case of `c`: lowercase becomes uppercase, anything else
/// goes through `std.ascii.toLower`.
pub fn negateChar(comptime c: u8) u8 {
    if (std.ascii.isLower(c)) {
        return std.ascii.toUpper(c);
    }
    return std.ascii.toLower(c);
}

////////////////////////////////////////////
//        Tree Parsing Functions          //
////////////////////////////////////////////

pub fn ParseRegexTreeBreadth(comptime sq: []const RegexSymbol) type {
    comptime
{
        if (sq.len == 0)
            return struct {}; // terminal node

        // Split on the first top-level '|' (if any); each side is parsed
        // recursively and the two alternatives are joined with RegexOR.
        const pipe: usize = pipeSearch(sq, 0);

        if (pipe < sq.len) {
            return RegexOR(
                ParseRegexTreeBreadth(sq[0..pipe]),
                ParseRegexTreeBreadth(sq[pipe + 1 ..]),
            );
        } else {
            return ParseRegexTreeDepth(sq);
        }
    }
}

/// Parses a pipe-free symbol sequence into a chain of `RegexAND` nodes.
/// The head symbol (a bracketed group, a character function, a special
/// symbol or a literal) becomes a `RegexUnit`, picking up the quantifier that
/// directly follows it; the rest of the sequence is parsed recursively.
/// An empty sequence yields the empty terminal struct. A sequence that starts
/// with a quantifier is a compile error.
pub fn ParseRegexTreeDepth(comptime sq: []const RegexSymbol) type {
    comptime {
        if (sq.len == 0)
            return struct {}; // terminal node

        var _sq = sq; // shrinking list

        // deduce function
        const Node: type = switch (_sq[0]) {
            .s => |s| outer: {
                if (isRegexBracket(s)) {
                    // this branch deduces an entire sub-automaton
                    var closing = closingBracket(sq, bracketSet(s), 0);

                    // For parsing anything between brackets, make sure
                    // to put the code within this 'sub' block so the parser
                    // can continue beyond the end of the brackets. Otherwise,
                    // this will result in a segfault.

                    const T: type = sub: {
                        if (s.char == '[') {
                            break :sub RegexCharset(sq[1..closing]);
                        }

                        if (closing > 2 and s.char == '(') {
                            if (_sq[1] != .q or _sq[2] != .s) {
                                break :sub ParseRegexTreeBreadth(sq[1..closing]);
                            }
                            const t = _sq[1].q;
                            const u = _sq[2].s;

                            if (t == .optional) {
                                if (u.char == '=' and !u.escaped) { // (?=
                                    break :sub RegexLookAhead(ParseRegexTreeBreadth(sq[3..closing]), true);
                                }
                                if (u.char == '!' and !u.escaped) { // (?!
                                    break :sub RegexLookAhead(ParseRegexTreeBreadth(sq[3..closing]), false);
                                }
                            }
                        }

                        // parse everything between the brackets
                        break :sub ParseRegexTreeBreadth(sq[1..closing]);
                    };

                    // the entire automaton can be quantified
                    const q: ?RegexQuantifier =
                        if (closing + 1 >= _sq.len) null else switch (_sq[closing + 1]) {
                            .q => |q| inner: {
                                closing += 1; // step over the quantifier token as well
                                break :inner q;
                            },
                            .s => null,
                        };

                    _sq = _sq[closing + 1 ..];

                    break :outer RegexUnit(T, q);
                }

                _sq = _sq[1..]; // pop token

                const q: ?RegexQuantifier =
                    if (0 == _sq.len) null else switch (_sq[0]) {
                        .q => |q| inner: {
                            _sq = _sq[1..]; // pop token
                            break :inner q;
                        },
                        .s => null,
                    };

                if (s.escaped and isCharFunction(s.char)) {
                    break :outer RegexUnit(BindCharFunction(s.char, s.negated), q);
                } else {
                    if (!s.escaped) {
                        switch (s.char) {
                            '.' => break :outer RegexUnit(anyRegex, q),
                            '^' => {
                                if (q != null) @compileError("Symbol '^' cannot have a quantifier.");
                                break :outer RegexUnit(startsWithRegex, null);
                            },
                            '$' => {
                                if (q != null) @compileError("Symbol '$' cannot have a quantifier.");
                                break :outer RegexUnit(endsWithRegex, null);
                            },
                            else => {},
                        }
                    }
                }

                // default to direct equals
                break :outer RegexUnit(equalRegex(s.char), q);
            },
            .q => @compileError("ParseRegexTreeDepth: head quantifier"),
        };

        return RegexAND(Node, ParseRegexTreeDepth(_sq));
    }
}

/// Entry point: fuses escapes and quantifiers in `expression`, then builds
/// the matcher type for the resulting symbol sequence at comptime.
pub fn ParseRegexTree(
    comptime expression: []const u8,
) type {
    return comptime ParseRegexTreeBreadth(fuseQuantifiers(fuseEscapes(expression)));
}
diff --git a/src/fluent_static_bitset.zig b/src/fluent_static_bitset.zig
new file mode 100644
index 0000000..4f387bf
--- /dev/null
+++ b/src/fluent_static_bitset.zig
@@ -0,0 +1,89 @@
const std = @import("std");

/// A 256-bit set with one bit per possible `u8` value, stored as four
/// 64-bit words.
pub const StringBitSet = struct {
    // The word width is fixed at 64 because the indexing below splits a
    // position into (pos / 64, pos % 64). Deriving it from
    // @bitSizeOf(usize) would make each word too small on targets where
    // usize is not 64 bits wide.
    const word_bits = 64;
    const BackingSet = std.StaticBitSet(word_bits);

    bits: [4]BackingSet,

    /// init - returns an initEmpty instance of StringBitSet
    pub fn init() StringBitSet {
        return .{ .bits = .{
            BackingSet.initEmpty(),
            BackingSet.initEmpty(),
            BackingSet.initEmpty(),
            BackingSet.initEmpty(),
        } };
    }

    /// setValue - sets the value of the bit at the specified position.
    /// `pos` must be in 0...255.
    pub fn setValue(self: *StringBitSet, pos: usize, value: bool) void {
        std.debug.assert(pos < 4 * word_bits);
        self.bits[pos / word_bits].setValue(pos % word_bits, value);
    }

    /// isSet - checks if the bit at the specified position is set.
    /// `pos` must be in 0...255.
    pub fn isSet(self: *const StringBitSet, pos: usize) bool {
        std.debug.assert(pos < 4 * word_bits);
        return self.bits[pos / word_bits].isSet(pos % word_bits);
    }

    /// unionWith - computes the union of two StringBitSets
    pub fn unionWith(self: StringBitSet, other: StringBitSet) StringBitSet {
        return .{ .bits = .{
            self.bits[0].unionWith(other.bits[0]),
            self.bits[1].unionWith(other.bits[1]),
            self.bits[2].unionWith(other.bits[2]),
            self.bits[3].unionWith(other.bits[3]),
        } };
    }

    /// differenceWith - computes the difference of two StringBitSets
    pub fn differenceWith(self: StringBitSet, other: StringBitSet) StringBitSet {
        return .{ .bits = .{
            self.bits[0].differenceWith(other.bits[0]),
            self.bits[1].differenceWith(other.bits[1]),
            self.bits[2].differenceWith(other.bits[2]),
            self.bits[3].differenceWith(other.bits[3]),
        } };
    }

    /// intersectWith - computes the intersection of two StringBitSets
    pub fn intersectWith(self: StringBitSet, other: StringBitSet) StringBitSet {
        return .{ .bits = .{
            self.bits[0].intersectWith(other.bits[0]),
            self.bits[1].intersectWith(other.bits[1]),
            self.bits[2].intersectWith(other.bits[2]),
            self.bits[3].intersectWith(other.bits[3]),
        } };
    }

    /// count - counts the number of set bits in the StringBitSet
    pub fn count(self: StringBitSet) usize {
        return self.bits[0].count() + self.bits[1].count() + self.bits[2].count() + self.bits[3].count();
    }

    /// fillBuffer - fills a buffer with the values of set bits in the StringBitSet,
    /// in ascending order, and returns the filled prefix of `buffer`.
    /// `buffer` must hold at least `count()` bytes.
    pub fn fillBuffer(self: *const StringBitSet, buffer: []u8) []u8 {
        var pos: usize = 0;
        for (0..4 * word_bits) |val| {
            if (self.isSet(val)) {
                buffer[pos] = @intCast(val);
                pos += 1;
            }
        }
        return buffer[0..pos];
    }
};
diff --git a/src/fluent_unary_fn_adapter.zig b/src/fluent_unary_fn_adapter.zig
new file mode 100644
index 0000000..1bb1e2e
--- /dev/null
+++ b/src/fluent_unary_fn_adapter.zig
@@ -0,0 +1,113 @@
const std = @import("std");
const Child = std.meta.Child;
const Order = std.math.Order;
const ReduceOp = std.builtin.ReduceOp;
const math = std.math;

////////////////////////////////////////////////////////////////////////////////
// HELPERS IMPORT                                                            ///
////////////////////////////////////////////////////////////////////////////////

const flth = @import("fluent_helpers.zig");

const DeepChild = flth.DeepChild;
const isConst = flth.isConst;
const identity = flth.identity;
const isSlice = flth.isSlice;
const Parameter = flth.Parameter;
const default = flth.default;
const tupleSize = flth.tupleSize;
const wrapIndex = flth.wrapIndex;
const isInteger = flth.isInteger;
const isUnsigned = flth.isUnsigned;
const isFloat = flth.isFloat;
const simdReduce = flth.simdReduce;
const reduceInit = flth.reduceInit;
const add = flth.add;
const mul = flth.mul;

////////////////////////////////////////////////////////////////////////////////
// UNARY FUNCTION ADAPTER :                                                   //
////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////
// chain: combine multiple unary functions
// into a single in-order call

/// Packs the functions in `unary_tuple` into a type whose `call(x)` feeds `x`
/// through them front to back (see `unwrap`).
pub fn Chain(
    comptime unary_tuple: anytype,
) type {
    return struct {
        pub fn call(x: anytype) @TypeOf(@call(.auto, unwrap, .{ 0, unary_tuple, default(@TypeOf(x)) })) {
            return @call(.always_inline, unwrap, .{ 0, unary_tuple, x });
        }
    };
}

/// Applies `unary_tuple[pos]`, then `unary_tuple[pos + 1]`, ... to `arg`,
/// returning `arg` untouched once `pos` reaches the end of the tuple.
/// NOTE(review): the declared return type is that of `unary_tuple[pos]` alone,
/// so this presumably only type-checks when later functions keep that type -
/// confirm against callers.
pub fn unwrap(
    comptime pos: usize,
    comptime unary_tuple: anytype,
    arg: anytype,
) if (pos < tupleSize(unary_tuple))
    @TypeOf(unary_tuple[pos](default(@TypeOf(arg))))
else
    @TypeOf(arg) {
    // this is a forward-unwrap that passes
    // outcomes of one function to the next
    if (comptime pos == tupleSize(unary_tuple)) {
        return arg;
    }
    return @call(.always_inline, unwrap, .{ (pos + 1), unary_tuple, @call(.always_inline, unary_tuple[pos], .{arg}) });
}

//////////////////////////////////////////////////////////////////////
// bind: affix comptime arguments to the front of a function

/// Returns a function that calls `function` with `bind_tuple` prepended to
/// its own one or two runtime arguments. The result type is that of the
/// first runtime argument.
pub fn bind(
    comptime bind_tuple: anytype,
    comptime function: anytype,
) BindRetun(bind_tuple, function) {
    const bind_count = comptime tupleSize(bind_tuple);
    const total_count = comptime @typeInfo(@TypeOf(function)).@"fn".params.len;

    // BindRetun (evaluated for the return type) has already rejected every
    // arity other than one or two unbound parameters.
    if (comptime total_count - bind_count == 1) {
        return struct {
            pub fn call(x: anytype) @TypeOf(x) {
                return @call(.always_inline, function, bind_tuple ++ .{x});
            }
        }.call;
    } else {
        return struct {
            pub fn call(x: anytype, y: anytype) @TypeOf(x) {
                return @call(.always_inline, function, bind_tuple ++ .{ x, y });
            }
        }.call;
    }
}

/// Computes the function type returned by `bind`: a generic unary function
/// when one parameter of `function` is left unbound, a generic binary
/// function when two are. Any other arity is a compile error, including
/// binding every parameter.
pub fn BindRetun(
    comptime bind_tuple: anytype,
    comptime function: anytype,
) type {
    const total_count = comptime @typeInfo(@TypeOf(function)).@"fn".params.len;
    const bind_count = comptime tupleSize(bind_tuple);

    if (comptime total_count < bind_count)
        @compileError("too many arguments to bind");

    const remaining = comptime total_count - bind_count;

    // Zero unbound parameters used to fall through to the binary case and
    // only fail later, at the call site, with an unrelated arity error.
    if (comptime remaining == 0 or remaining > 2)
        @compileError("fluent bind must result in unary or binary function");

    // Prototypes whose types stand in for the two possible results.
    const choices = struct {
        pub fn unary(x: anytype) @TypeOf(x) {
            return x;
        }
        pub fn binary(x: anytype, y: anytype) @TypeOf(x) {
            _ = &y;
            return x;
        }
    };
    return if (comptime remaining == 1)
        @TypeOf(choices.unary)
    else
        @TypeOf(choices.binary);
}