diff --git a/contrib/code-file-concat/.gitignore b/contrib/code-file-concat/.gitignore new file mode 100644 index 00000000..b9b71011 --- /dev/null +++ b/contrib/code-file-concat/.gitignore @@ -0,0 +1 @@ +data/output diff --git a/contrib/code-file-concat/Cargo.lock b/contrib/code-file-concat/Cargo.lock new file mode 100644 index 00000000..23500fbc --- /dev/null +++ b/contrib/code-file-concat/Cargo.lock @@ -0,0 +1,741 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + 
"once_cell", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.97" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" + +[[package]] +name = "bitflags" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cc" +version = "1.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "code-file-concat" +version = "0.1.0" +dependencies = [ + "clap", + "mj_io", + "rand", + "rayon", + "serde_json", +] + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.14.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7914353092ddf589ad78f25c5c1c21b7f80b0ff8621e7c814c3485b5306da9d" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "flate2" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi", + "windows-targets", +] + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "jobserver" 
+version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.170" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" + +[[package]] +name = "log" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5" +dependencies = [ + "adler2", +] + +[[package]] +name = "mj_io" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d233748539aa5c1d0946ce06d14cc74dcb80b5a419b984d6e3f726cb05b749d" +dependencies = [ + "anyhow", + "flate2", + "glob", + "indicatif", + "zstd", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" + +[[package]] +name = "pkg-config" +version = 
"0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "portable-atomic" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy 0.7.35", +] + +[[package]] +name = "proc-macro2" +version = "1.0.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +dependencies = [ + "rand_chacha", + "rand_core", + "zerocopy 0.8.22", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "serde" +version = "1.0.218" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.218" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + 
+[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + 
"wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags", +] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive 0.7.35", +] + +[[package]] +name = "zerocopy" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09612fda0b63f7cb9e0af7e5916fe5a1f8cdcb066829f10f36883207628a4872" +dependencies = [ + "zerocopy-derive 0.8.22", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.22" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "79f81d38d7a2ed52d8f034e62c568e111df9bf8aba2f7cf19ddc5bf7bd89d520" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3051792fbdc2e1e143244dc28c60f73d8470e93f3f9cbd0ead44da5ed802722" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.14+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb060d4926e4ac3a3ad15d864e99ceb5f343c6b34f5bd6d81ae6ed417311be5" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/contrib/code-file-concat/Cargo.toml b/contrib/code-file-concat/Cargo.toml new file mode 100644 index 00000000..a6d41793 --- /dev/null +++ b/contrib/code-file-concat/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "code-file-concat" +version = "0.1.0" +edition = "2021" + +[dependencies] +clap = { version = "4.5.29", features = ["derive"] } +mj_io = "0.1.2" +rand = "0.9.0" +rayon = "1.10.0" +serde_json = "1.0.138" diff --git a/contrib/code-file-concat/README.md b/contrib/code-file-concat/README.md new file mode 100644 index 00000000..2f3d4175 --- /dev/null +++ b/contrib/code-file-concat/README.md @@ -0,0 +1,31 @@ +# Code File Concatenation + +A standalone-script used for concatenating code files within +a programming language and repo. Assumes data is pre-partitioned +by repo, and sorted by repo+pl. Each partition contain more than +one repo. + +Simple at present, concatenates with special token as delimiter; +can randomize file order. Extend from here. + +## Usage + +Assumes partitions consist of jsonl strings, one per line. 
Must contain +a `"text"` field as well as a `"metadata"` field. The latter should contain +subfields that specify programming language and repo name. The field names +are parameterized, with defaults shown in the command below. + +```shell +cargo run --release -- \ + --inputs 'data/input/*jsonl' \ + --output data/output \ + --randomize-order \ + --file-separator-token '<|file_sep|>' + --repo-field-name repo_name + --pl-field-name language +``` + +It will run over all files matching `data/input/*.jsonl`, +writing results to `data/output`. Ordering and partition will be preserved, +with fewer resulting documents in each output partition, per concatenation. + diff --git a/contrib/code-file-concat/src/concat.rs b/contrib/code-file-concat/src/concat.rs new file mode 100644 index 00000000..aacd9456 --- /dev/null +++ b/contrib/code-file-concat/src/concat.rs @@ -0,0 +1,82 @@ +use rand::prelude::*; +use rand::rng; +use std::iter; + +use serde_json::json; + +fn get_metadata_field(document: &serde_json::Value, field_name: &str) -> String { + document + .get("metadata") + .and_then(|m| m.get(field_name)) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string() +} + +pub struct CodeFileConcat<'a> { + pub randomize_order: bool, + pub file_separator_token: &'a str, + pub repo_field_name: &'a str, + pub pl_field_name: &'a str, +} + +impl CodeFileConcat<'_> { + pub fn perform_on_partition<'a, T: Iterator + 'a>( + &'a self, + documents: &'a mut T, + ) -> impl Iterator + 'a { + let mut random = rng(); + let mut maybe_current_group_head: Option = documents.next(); + + iter::from_fn(move || { + let current_group_head = maybe_current_group_head.take()?; + + let current_repo = get_metadata_field(¤t_group_head, &self.repo_field_name); + let current_pl = get_metadata_field(¤t_group_head, &self.pl_field_name); + + let mut repo_texts: Vec = vec![current_group_head + .get("text") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string()]; + + while let Some(next) = documents.next() { + 
let next_repo = get_metadata_field(&next, &self.repo_field_name); + let next_pl = get_metadata_field(&next, &self.pl_field_name); + if next_repo == current_repo && next_pl == current_pl { + repo_texts.push( + next.get("text") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + ); + } else { + maybe_current_group_head = Some(next); + break; + } + } + + if self.randomize_order { + repo_texts.shuffle(&mut random); + } + + let repo_text = repo_texts.join(&self.file_separator_token); + + // We chose an arbitrary node from the current repo/pl group to + // represent the concatenated document. + // No attempt is made at this stage to coalesce attributes or ids + // or other data from the different group members, but it perhaps + // warrants future consideration. + let mut repo_document = current_group_head.clone(); + + if let Some(obj) = repo_document.as_object_mut() { + obj.insert("text".to_string(), json!(repo_text)); + if let Some(metadata) = obj.get_mut("metadata") { + metadata["files_concatenated"] = json!(repo_texts.len()); + } + } + + Some(repo_document) + }) + } +} diff --git a/contrib/code-file-concat/src/main.rs b/contrib/code-file-concat/src/main.rs new file mode 100644 index 00000000..58151cc1 --- /dev/null +++ b/contrib/code-file-concat/src/main.rs @@ -0,0 +1,200 @@ +use clap::Parser; +use std::io::{BufRead, Error}; +use std::path::PathBuf; + +use mj_io::{build_pbar, expand_dirs, read_pathbuf_to_mem, write_mem_to_pathbuf}; +use rayon::prelude::*; +use serde_json; + +mod concat; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Input files to process + #[arg(short, long, required = true)] + inputs: Vec, + + /// Destination output file + #[arg(short, long, required = true)] + output: PathBuf, + + /// Whether to randomize the order during concatenation + #[arg(long, default_value_t = false)] + randomize_order: bool, + + /// Value of the file separator token + #[arg(long, required = true, default_value = 
"<|file_sep|>")] + file_separator_token: String, + + /// Which metadata field to find the repo name + #[arg(long, required = true, default_value = "repo_name")] + repo_field_name: String, + + /// Which metadata field to find the programming language + #[arg(long, required = true, default_value = "language")] + pl_field_name: String, +} + +/// Compute the longest common prefix of a non-empty slice of paths. +fn compute_common_prefix(paths: &[PathBuf]) -> PathBuf { + if paths.is_empty() { + return PathBuf::new(); + } + + // Split each path into its components. + let components: Vec> = paths + .iter() + .map(|p| p.components().map(|c| c.as_os_str()).collect()) + .collect(); + + // Find the minimum number of components among all paths. + let min_len = components.iter().map(|comp| comp.len()).min().unwrap_or(0); + + let mut common = PathBuf::new(); + // Iterate index-by-index comparing components. + for i in 0..min_len { + let candidate = components[0][i]; + if components.iter().all(|comp| comp[i] == candidate) { + common.push(candidate); + } else { + break; + } + } + common +} + +/// Given a list of input paths and a destination prefix, +/// returns a vector where each file is the destination prefix plus +/// the input file’s path relative to the shared prefix. +fn map_paths_to_destination(inputs: &Vec, dest_prefix: PathBuf) -> Vec { + if inputs.is_empty() { + return Vec::new(); + } + + if inputs.len() == 1 { + // get filename from single input path + let src_filename = inputs[0].file_name().unwrap(); + let dst_filename = dest_prefix.join(src_filename); + return vec![dst_filename]; + } + + let common_prefix = compute_common_prefix(&inputs); + inputs + .into_iter() + .map(|input| { + // Calculate the relative path from the common prefix. + let relative = input + .strip_prefix(&common_prefix) + .expect("All inputs should share the common prefix"); + // Build the new destination path. 
+ let mut new_path = dest_prefix.clone(); + new_path.push(relative); + new_path + }) + .collect() +} + +fn find_all_paths(inputs: Vec) -> Vec { + let all_paths: Vec = inputs + .into_iter() + .map(|path| { + let manual_ext: Option>; // Store Vec instead of &[&str] + let input_paths: Vec; + + match path.extension() { + Some(ext) => { + let ext_str = ext.to_string_lossy().into_owned(); // Convert to owned String + manual_ext = Some(vec![ext_str]); // Store as Vec + let mut trunk_path = path.clone(); + trunk_path.pop(); + input_paths = vec![trunk_path]; + } + None => { + manual_ext = None; + input_paths = vec![path]; + } + } + + // Convert Vec to Vec<&str> before passing it to expand_dirs + let manual_ext_refs: Option> = manual_ext + .as_ref() + .map(|v| v.iter().map(|s| s.as_str()).collect()); + expand_dirs(input_paths, manual_ext_refs.as_deref()).unwrap_or_default() + }) + .flatten() + .collect(); + return all_paths; +} + +fn mk_serde_doc_reader(reader: R) -> impl Iterator { + reader.lines().map(|line| { + let line = line.unwrap(); + serde_json::from_str(&line).unwrap() + }) +} + +fn process_single( + src_path: &PathBuf, + dst_path: &PathBuf, + randomize_order: bool, + file_separator_token: &str, + repo_field_name: &str, + pl_field_name: &str, +) -> Result<(), Error> { + let concater = concat::CodeFileConcat { + randomize_order, + file_separator_token, + repo_field_name, + pl_field_name, + }; + + println!("Processing {:?} -> {:?}", src_path, dst_path); + let src_buf = read_pathbuf_to_mem(src_path).unwrap(); + let mut out_bytes: Vec = Vec::new(); + let newline: u8 = b'\n'; + + let mut file_documents = mk_serde_doc_reader(src_buf); + let repo_documents = concater.perform_on_partition(&mut file_documents); + + for repo_document in repo_documents { + out_bytes.extend_from_slice(&serde_json::to_vec(&repo_document)?); + out_bytes.push(newline) + } + + write_mem_to_pathbuf(&out_bytes, dst_path).unwrap(); + Ok(()) +} + +fn main() { + // parse command line arguments + let 
args: Args = Args::parse(); + + // for each prefix, we derive both + let all_src: Vec = find_all_paths(args.inputs); + + println!("Found {} paths to process", all_src.len()); + + let all_dst = map_paths_to_destination(&all_src, args.output.clone()); + + let pbar = build_pbar(all_src.len(), "Processing files"); + + // here we can use rayon to parallelize the mapping operation + all_src + .into_iter() + .zip(all_dst.into_iter()) + .collect::>() + .par_iter() + .for_each(|(src_path, dst_path)| { + process_single( + src_path, + dst_path, + args.randomize_order, + &args.file_separator_token, + &args.repo_field_name, + &args.pl_field_name, + ) + .unwrap(); + pbar.inc(1); + }); +} diff --git a/contrib/code-file-concat/tests/test_concat.py b/contrib/code-file-concat/tests/test_concat.py new file mode 100644 index 00000000..876a4c98 --- /dev/null +++ b/contrib/code-file-concat/tests/test_concat.py @@ -0,0 +1,186 @@ +import json +import os +from pathlib import Path +import shlex +import subprocess +import tempfile +from typing import Any, Dict, List +import unittest +from uuid import uuid4 + + +FILE_SEPARATOR = "<|file_sep|>" +REPO_FIELD_NAME = "repo_name" +PL_FIELD_NAME = "language" + + +JAVASCRIPT_FILE_CONTENTS = [ + "function add(a, b) { return a + b; };", + "function subtract(a, b) { return a - b; };", +] + +PYTHON_FILE_CONTENTS = [ + "add = lambda a, b: a + b", + "subtract = lambda a, b: a - b", +] + + + +def mk_command( + input_dir: str, + output_dir: str, + randomize_order: bool, +) -> str: + return f""" +cargo run -- \ + --inputs '{input_dir}/*.jsonl' \ + --output {output_dir} \ + {'--randomize-order' if randomize_order else ''} \ + --file-separator-token '{FILE_SEPARATOR}' \ + --repo-field-name '{REPO_FIELD_NAME}' \ + --pl-field-name '{PL_FIELD_NAME}' +""".strip() + + +def mk_partition_files(dir: str, num_partitions: int, num_repos_per_partition: int) -> None: + row_jsons = [] + for i in range(num_repos_per_partition): + + # javascript + for javascript in 
JAVASCRIPT_FILE_CONTENTS: + row_json = json.dumps( + dict( + source="somesource", + version="1234", + id=str(uuid4()), + text=javascript, + metadata={ + PL_FIELD_NAME: "Javascript", + REPO_FIELD_NAME: f"repo-{i}" + } + ) + ) + row_jsons.append(row_json) + + # python + for python in PYTHON_FILE_CONTENTS: + row_json = json.dumps( + dict( + source="somesource", + version="1234", + id=str(uuid4()), + text=python, + metadata={ + PL_FIELD_NAME: "Python", + REPO_FIELD_NAME: f"repo-{i}" + } + ) + ) + row_jsons.append(row_json) + + lines = "\n".join(row_jsons) + + for i in range(num_partitions): + output_path = os.path.join(dir, f"{i}.jsonl") + with open(output_path, "w") as f: + f.write(lines) + + +def perform_concatenation( + num_partitions: int, + num_repos_per_partition: int, + randomize_order: float +) -> List[List[Dict[str, Any]]]: + with tempfile.TemporaryDirectory() as tmpdir: + input_dir = os.path.join(tmpdir, "input") + output_dir = os.path.join(tmpdir, "output") + os.mkdir(input_dir) + os.mkdir(output_dir) + mk_partition_files(input_dir, num_partitions, num_repos_per_partition) + cmd = mk_command(input_dir, output_dir, randomize_order) + + root_dir = _find_rust_root() + subprocess.run(shlex.split(cmd), check=True, cwd=root_dir) + + results = [] + + for output_file_name in sorted(os.listdir(output_dir)): + with open(os.path.join(output_dir, output_file_name), "r") as output_file: + output_dicts = [json.loads(line) for line in output_file.readlines()] + results.append(output_dicts) + + return results + +def _find_rust_root() -> Path: + rust_root = Path(__file__) + while True: + if rust_root == Path("/"): + raise FileNotFoundError("Could not find rust root") + if (rust_root / "Cargo.toml").exists(): + return rust_root + rust_root = rust_root.parent + + +class TestCodeFileConcat(unittest.TestCase): + def assert_partition_looks_good(self, repo_docs, expected_num_repos) -> None: + print(repo_docs) + self.assertEqual(len(repo_docs), expected_num_repos*2) + + for i in 
range(0, expected_num_repos * 2, 2): # rows come in (javascript, python) pairs, two rows per repo + javascript_row, python_row = repo_docs[i], repo_docs[i+1] + + self.assertEqual(FILE_SEPARATOR.join(JAVASCRIPT_FILE_CONTENTS), javascript_row["text"]) + self.assertEqual(FILE_SEPARATOR.join(PYTHON_FILE_CONTENTS), python_row["text"]) + + self.assertEqual(javascript_row["metadata"][PL_FIELD_NAME], "Javascript") + self.assertEqual(python_row["metadata"][PL_FIELD_NAME], "Python") + + self.assertEqual(javascript_row["metadata"][REPO_FIELD_NAME], f"repo-{i//2}") + self.assertEqual(python_row["metadata"][REPO_FIELD_NAME], f"repo-{i//2}") + + def test__concatenation_works_in_simplest_case(self) -> None: + output_rows = perform_concatenation( + num_partitions=1, + num_repos_per_partition=1, + randomize_order=False, + ) + + self.assertEqual(len(output_rows), 1) + self.assert_partition_looks_good(output_rows[0], expected_num_repos=1) + + def test__concatenation_works_over_many_repos_and_partitions(self) -> None: + output_rows = perform_concatenation( + num_partitions=10, + num_repos_per_partition=4, + randomize_order=False + ) + + self.assertEqual(len(output_rows), 10) + + for partition in output_rows: + self.assert_partition_looks_good(partition, expected_num_repos=4) + + def test__randomized_order_works(self) -> None: + output_rows = perform_concatenation( + num_partitions=1, + num_repos_per_partition=10_000, + randomize_order=True, + ) + + self.assertEqual(len(output_rows[0]), 20_000) + + repo_docs_that_start_with_the_add_fn = [ + repo_doc + for repo_doc in output_rows[0] + if (repo_doc["text"].startswith(JAVASCRIPT_FILE_CONTENTS[0]) + and repo_doc["metadata"][PL_FIELD_NAME] == "Javascript") + or (repo_doc["text"].startswith(PYTHON_FILE_CONTENTS[0]) + and repo_doc["metadata"][PL_FIELD_NAME] == "Python") + ] + # 20_000 fair coin flips have a std-dev of ~0.0035; delta=0.02 keeps the check from failing by chance + self.assertAlmostEqual(len(repo_docs_that_start_with_the_add_fn) / 20_000, 0.5, delta=0.02) + + + + + diff --git a/contrib/fill-in-middle/.gitignore b/contrib/fill-in-middle/.gitignore new file mode 100644 index 00000000..b9b71011 ---
/dev/null +++ b/contrib/fill-in-middle/.gitignore @@ -0,0 +1 @@ +data/output diff --git a/contrib/fill-in-middle/Cargo.lock b/contrib/fill-in-middle/Cargo.lock new file mode 100644 index 00000000..dc3ffb45 --- /dev/null +++ b/contrib/fill-in-middle/Cargo.lock @@ -0,0 +1,780 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.97" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" + +[[package]] +name = "bitflags" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cc" +version = "1.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.28" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b7914353092ddf589ad78f25c5c1c21b7f80b0ff8621e7c814c3485b5306da9d" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "fill-in-middle" +version = "0.1.0" +dependencies = [ + "clap", + "mj_io", + "rand", + "rayon", + "regex", + "serde_json", +] + +[[package]] +name = "flate2" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi", + "windows-targets", +] + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.170" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" + +[[package]] +name = "log" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5" +dependencies = [ + "adler2", +] + +[[package]] +name = "mj_io" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d233748539aa5c1d0946ce06d14cc74dcb80b5a419b984d6e3f726cb05b749d" +dependencies = [ + "anyhow", + "flate2", + "glob", + "indicatif", + "zstd", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "portable-atomic" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy 0.7.35", +] + +[[package]] +name = "proc-macro2" +version = "1.0.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +dependencies = [ + "rand_chacha", + "rand_core", + "zerocopy 0.8.22", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rayon" +version = "1.10.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "serde" +version = "1.0.218" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.218" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] 
+name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + 
"windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags", 
+] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive 0.7.35", +] + +[[package]] +name = "zerocopy" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09612fda0b63f7cb9e0af7e5916fe5a1f8cdcb066829f10f36883207628a4872" +dependencies = [ + "zerocopy-derive 0.8.22", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79f81d38d7a2ed52d8f034e62c568e111df9bf8aba2f7cf19ddc5bf7bd89d520" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3051792fbdc2e1e143244dc28c60f73d8470e93f3f9cbd0ead44da5ed802722" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.14+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb060d4926e4ac3a3ad15d864e99ceb5f343c6b34f5bd6d81ae6ed417311be5" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/contrib/fill-in-middle/Cargo.toml b/contrib/fill-in-middle/Cargo.toml new file mode 100644 index 00000000..da9cdc95 --- /dev/null +++ b/contrib/fill-in-middle/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "fill-in-middle" 
+version = "0.1.0" +edition = "2021" + +[dependencies] +clap = { version = "4.5.29", features = ["derive"] } +mj_io = "0.1.2" +rand = "0.9.0" +rayon = "1.10.0" +regex = "1.11.1" +serde_json = "1.0.138" diff --git a/contrib/fill-in-middle/README.md b/contrib/fill-in-middle/README.md new file mode 100644 index 00000000..227b5a42 --- /dev/null +++ b/contrib/fill-in-middle/README.md @@ -0,0 +1,56 @@ +# Fill-In-Middle (FIM) + +A standalone script used for producing reordered code text, +for use in training code completion between a given code block +prefix and suffix. + +For instance, given an example block of code like this: + +```plaintext +def add_two_numbers(a: int, b: int) -> int: + sum = a + b + return sum +``` + +we would expect a so-called FIM rearrangement +of this general format: + +```plaintext +<|fim_prefix|>def add_two_numbers(a: int, b: int) -> int: +<|fim_suffix|> + return sum<|fim_middle|> sum = a + b +``` + +Note the use of special sentinel tokens to demarcate the beginnings +of each rearranged subsection. + +## Usage + +Examples to rewrite as FIM documents must be partitioned into 1 or more +jsonl files, where each row contains a member `"text"`. Partitioning and +ordering within partitions will be preserved in output files. + +If more than one code source file is present per `"text"` entry, delimit +with a separator token, e.g. `<|file_sep|>`. Reordering will only be applied +within a given source file. + +Here's an example of how to use the FIM script: + +```shell +cargo run --release -- \ + --inputs 'data/input/*.jsonl' \ + --output data/output \ + --fim-rate 0.5 \ + --psm-spm-split 0.25 \ + --file-separator-token '<|file_sep|>' \ + --fim-prefix-token '<|fim_prefix|>' \ + --fim-middle-token '<|fim_middle|>' \ + --fim-suffix-token '<|fim_suffix|>' +``` + +It will run over all files matching `data/input/*.jsonl`, +writing results to `data/output`.
FIM reordering will be applied +to 50% of source code files detected within the provided rows after splitting +on `<|file_sep|>`. Of those, 25% will be ordered as Prefix-Suffix-Middle (psm), +and the remaining 75% will be reordered as Suffix-Prefix-Middle (spm). + diff --git a/contrib/fill-in-middle/src/fim.rs b/contrib/fill-in-middle/src/fim.rs new file mode 100644 index 00000000..a4dca201 --- /dev/null +++ b/contrib/fill-in-middle/src/fim.rs @@ -0,0 +1,80 @@ +use rand::prelude::*; +use rand::rng; +use rand::seq::index::sample; + +pub struct FillInMiddle<'a> { + pub fim_rate: f32, + pub psm_spm_split: f32, + pub file_separator_token: &'a str, + pub fim_prefix_token: &'a str, + pub fim_middle_token: &'a str, + pub fim_suffix_token: &'a str, +} + +impl FillInMiddle<'_> { + pub fn perform_on_document_text(&mut self, document_text: &str) -> String { + let mut random = rng(); + + document_text + .split(&self.file_separator_token) + .map(|file_text| { + // Decide whether we're applying FIM to this file text + if &mut random.random::() < &mut self.fim_rate { + // Extract into unicode chars because of multi-byte characters + let file_chars: Vec = file_text.chars().collect(); + + // Exclude front and rear character indices we don't want to split at + let front_offset = 1; + let rear_offset = 1; + let range_clip = front_offset + rear_offset + 1; + + // Boundary condition: text is too short to rearrange + if range_clip > file_chars.len() || (file_chars.len() - range_clip) < 2 { + file_text.to_string() + } else { + let mut break_points: Vec = + sample(&mut random, file_chars.len() - range_clip, 2) + .into_iter() + .map(|index| index + front_offset) + .collect(); + break_points.sort(); + + // Slice out the chars and back to utf-8 strings + let prefix = file_chars[..break_points[0]].iter().collect::(); + let middle = file_chars[break_points[0]..break_points[1]] + .iter() + .collect::(); + let suffix = file_chars[break_points[1]..].iter().collect::(); + + if &mut 
random.random::() < &mut self.psm_spm_split { + // Reorder into Prefix-Suffix-Middle + format!( + "{}{}{}{}{}{}", + self.fim_prefix_token, + prefix, + self.fim_suffix_token, + suffix, + self.fim_middle_token, + middle + ) + } else { + // Reorder into Suffix-Prefix-Middle + format!( + "{}{}{}{}{}{}", + self.fim_suffix_token, + suffix, + self.fim_prefix_token, + prefix, + self.fim_middle_token, + middle + ) + } + } + } else { + file_text.to_string() + } + }) + .collect::>() + .join(&self.file_separator_token) + } +} diff --git a/contrib/fill-in-middle/src/main.rs b/contrib/fill-in-middle/src/main.rs new file mode 100644 index 00000000..3c7181c3 --- /dev/null +++ b/contrib/fill-in-middle/src/main.rs @@ -0,0 +1,211 @@ +use clap::Parser; +use std::io::{BufRead, Error}; +use std::path::PathBuf; + +use mj_io::{build_pbar, expand_dirs, read_pathbuf_to_mem, write_mem_to_pathbuf}; +use rayon::prelude::*; +use serde_json; + +mod fim; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Input files to process + #[arg(short, long, required = true)] + inputs: Vec, + + /// Destination output file + #[arg(short, long, required = true)] + output: PathBuf, + + /// Rate at which to perform FIM reordering + #[arg(long, required = true)] + fim_rate: f32, + + /// Rate at which to perform Prefix-Suffix-Middle vs Suffix-Prefix-Middle reordering + #[arg(long, required = true)] + psm_spm_split: f32, + + /// Value of the file separator token + #[arg(long, required = true, default_value = "<|file_sep|>")] + file_separator_token: String, + + /// Value of the fill-in-middle prefix sentinel token + #[arg(long, required = true, default_value = "<|fim_prefix|>")] + fim_prefix_token: String, + + /// Value of the fill-in-middle middle sentinel token + #[arg(long, required = true, default_value = "<|fim_middle|>")] + fim_middle_token: String, + + /// Value of the fill-in-middle suffix sentinel token + #[arg(long, required = true, default_value = 
"<|fim_suffix|>")] + fim_suffix_token: String, +} + +/// Compute the longest common prefix of a non-empty slice of paths. +fn compute_common_prefix(paths: &[PathBuf]) -> PathBuf { + if paths.is_empty() { + return PathBuf::new(); + } + + // Split each path into its components. + let components: Vec> = paths + .iter() + .map(|p| p.components().map(|c| c.as_os_str()).collect()) + .collect(); + + // Find the minimum number of components among all paths. + let min_len = components.iter().map(|comp| comp.len()).min().unwrap_or(0); + + let mut common = PathBuf::new(); + // Iterate index-by-index comparing components. + for i in 0..min_len { + let candidate = components[0][i]; + if components.iter().all(|comp| comp[i] == candidate) { + common.push(candidate); + } else { + break; + } + } + common +} + +/// Given a list of input paths and a destination prefix, +/// returns a vector where each file is the destination prefix plus +/// the input file’s path relative to the shared prefix. +fn map_paths_to_destination(inputs: &Vec, dest_prefix: PathBuf) -> Vec { + if inputs.is_empty() { + return Vec::new(); + } + + if inputs.len() == 1 { + // get filename from single input path + let src_filename = inputs[0].file_name().unwrap(); + let dst_filename = dest_prefix.join(src_filename); + return vec![dst_filename]; + } + + let common_prefix = compute_common_prefix(&inputs); + inputs + .into_iter() + .map(|input| { + // Calculate the relative path from the common prefix. + let relative = input + .strip_prefix(&common_prefix) + .expect("All inputs should share the common prefix"); + // Build the new destination path. 
+ let mut new_path = dest_prefix.clone(); + new_path.push(relative); + new_path + }) + .collect() +} + +fn find_all_paths(inputs: Vec) -> Vec { + let all_paths: Vec = inputs + .into_iter() + .map(|path| { + let manual_ext: Option>; // Store Vec instead of &[&str] + let input_paths: Vec; + + match path.extension() { + Some(ext) => { + let ext_str = ext.to_string_lossy().into_owned(); // Convert to owned String + manual_ext = Some(vec![ext_str]); // Store as Vec + let mut trunk_path = path.clone(); + trunk_path.pop(); + input_paths = vec![trunk_path]; + } + None => { + manual_ext = None; + input_paths = vec![path]; + } + } + + // Convert Vec to Vec<&str> before passing it to expand_dirs + let manual_ext_refs: Option> = manual_ext + .as_ref() + .map(|v| v.iter().map(|s| s.as_str()).collect()); + expand_dirs(input_paths, manual_ext_refs.as_deref()).unwrap_or_default() + }) + .flatten() + .collect(); + return all_paths; +} + +fn process_single( + src_path: &PathBuf, + dst_path: &PathBuf, + fim_rate: f32, + psm_spm_split: f32, + file_separator_token: &str, + fim_prefix_token: &str, + fim_middle_token: &str, + fim_suffix_token: &str, +) -> Result<(), Error> { + let mut fim = fim::FillInMiddle { + fim_rate, + psm_spm_split, + file_separator_token, + fim_prefix_token, + fim_middle_token, + fim_suffix_token, + }; + + println!("Processing {:?} -> {:?}", src_path, dst_path); + let src_buf = read_pathbuf_to_mem(src_path).unwrap(); + let mut out_bytes: Vec = Vec::new(); + let newline: u8 = b'\n'; + + for line in src_buf.lines() { + let line = line.unwrap(); + let mut json_obj: serde_json::Value = serde_json::from_str(&line).unwrap(); + + let src_text = json_obj.get("text").unwrap().as_str().unwrap(); + let new_text = fim.perform_on_document_text(src_text); + json_obj["text"] = serde_json::Value::String(new_text); + + out_bytes.extend_from_slice(&serde_json::to_vec(&json_obj)?); + out_bytes.push(newline); + } + + write_mem_to_pathbuf(&out_bytes, dst_path).unwrap(); + Ok(()) +} + 
+fn main() { + // parse command line arguments + let args: Args = Args::parse(); + + // for each prefix, we derive both + let all_src: Vec = find_all_paths(args.inputs); + + println!("Found {} paths to process", all_src.len()); + + let all_dst = map_paths_to_destination(&all_src, args.output.clone()); + + let pbar = build_pbar(all_src.len(), "Processing files"); + + // here we can use rayon to parallelize the mapping operation + all_src + .into_iter() + .zip(all_dst.into_iter()) + .collect::>() + .par_iter() + .for_each(|(src_path, dst_path)| { + process_single( + src_path, + dst_path, + args.fim_rate, + args.psm_spm_split, + &args.file_separator_token, + &args.fim_prefix_token, + &args.fim_middle_token, + &args.fim_suffix_token, + ) + .unwrap(); + pbar.inc(1); + }); +} diff --git a/contrib/fill-in-middle/tests/test_fim.py b/contrib/fill-in-middle/tests/test_fim.py new file mode 100644 index 00000000..20b87119 --- /dev/null +++ b/contrib/fill-in-middle/tests/test_fim.py @@ -0,0 +1,240 @@ +import json +import os +from pathlib import Path +import re +import shlex +import subprocess +import tempfile +from typing import Any, Dict, List, Tuple +import unittest +from uuid import uuid4 + + +FILE_SEPARATOR = "<|file_sep|>" +FIM_MIDDLE_TOKEN = "<|fim_mid|>" +FIM_PREFIX_TOKEN = "<|fim_prefix|>" +FIM_SUFFIX_TOKEN = "<|fim_suffix|>" + + +CODE_FILE_1 = """ +def add_two_integers(a: int, b: int) -> int: + sum = a + b + return sum + + +def multiply_two_integers(a: int, b: int) -> int: + product = a + b + return product + + +def sum_and_multiple(a: int, b: int, c: int) -> int: + sum = add_two_integers(a, b) + product = multiply_two_integers(sum, c) + return product +""" + +CODE_FILE_2 = """ +from typing import Any + +import requests + + +def call_api(url: str, token: str, json: Dict[str, Any]) -> Dict[str, Any]: + result = requests.post( + url, + headers={"x-api-token": token}, + json=json + ) + + return result.json() +""" + + +def mk_command( + input_dir: str, + output_dir: str, + 
fim_rate: float, + psm_spm_split: float, +) -> str: + return f""" +cargo run -- \ + --inputs '{input_dir}/*.jsonl' \ + --output {output_dir} \ + --fim-rate {fim_rate} \ + --psm-spm-split {psm_spm_split} \ + --file-separator-token '{FILE_SEPARATOR}' \ + --fim-prefix-token '{FIM_PREFIX_TOKEN}' \ + --fim-middle-token '{FIM_MIDDLE_TOKEN}' \ + --fim-suffix-token '{FIM_SUFFIX_TOKEN}' +""".strip() + + +def mk_partition_files(dir: str, num_partitions: int, num_rows_per_partition: int, row_text: str) -> None: + for i in range(num_partitions): + output_path = os.path.join(dir, f"{i}.jsonl") + with open(output_path, "w") as f: + row_json = json.dumps( + dict(source="somesource", version="1234", id=str(uuid4()), text=row_text, metadata={}) + ) + lines = "\n".join([row_json] * num_rows_per_partition) + f.write(lines) + + +def mk_text(num_source_files: int) -> str: + source_files = [CODE_FILE_1 if j % 2 == 0 else CODE_FILE_2 for j in range(num_source_files)] + return FILE_SEPARATOR.join(source_files) + + +def perform_rewrites( + num_partitions: int, + num_rows_per_partition: int, + row_text: str, + fim_rate: float, + psm_spm_split: float, +) -> List[Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]]: + with tempfile.TemporaryDirectory() as tmpdir: + input_dir = os.path.join(tmpdir, "input") + output_dir = os.path.join(tmpdir, "output") + os.mkdir(input_dir) + os.mkdir(output_dir) + mk_partition_files(input_dir, num_partitions, num_rows_per_partition, row_text) + cmd = mk_command(input_dir, output_dir, fim_rate, psm_spm_split) + + root_dir = _find_rust_root() + subprocess.run(shlex.split(cmd), check=True, cwd=root_dir) + + input_file_names = sorted(os.listdir(input_dir)) + output_file_names = sorted(os.listdir(output_dir)) + + results = [] + + for input_file_name, output_file_name in zip(input_file_names, output_file_names): + with open(os.path.join(input_dir, input_file_name), "r") as input_file: + input_dicts = [json.loads(line) for line in input_file.readlines()] + with 
open(os.path.join(output_dir, output_file_name), "r") as output_file: + output_dicts = [json.loads(line) for line in output_file.readlines()] + results.append((input_dicts, output_dicts)) + + return results + + +def perform_single_row_rewrite( + row_text: str, fim_rate: float, psm_spm_split: float +) -> Tuple[Dict[str, Any], Dict[str, Any]]: + inputs, outputs = perform_rewrites( + num_partitions=1, + num_rows_per_partition=1, + row_text=row_text, + fim_rate=fim_rate, + psm_spm_split=psm_spm_split, + )[0] + + return inputs[0], outputs[0] + + +def _find_rust_root() -> Path: + rust_root = Path(__file__) + while True: + if rust_root == Path("/"): + raise FileNotFoundError("Could not find rust root") + if (rust_root / "Cargo.toml").exists(): + return rust_root + rust_root = rust_root.parent + + +class TestFillInMiddle(unittest.TestCase): + def test__fim_reordering_works(self) -> None: + # First, Prefix-Suffix-Middle + psm_input_row, psm_output_row = perform_single_row_rewrite( + row_text=mk_text(1), + fim_rate=1.0, + psm_spm_split=1.0, + ) + + psm_original_text = psm_input_row["text"] + psm_final_text = psm_output_row["text"] + + prefix_plus_prefix_token, rest = psm_final_text.split(FIM_SUFFIX_TOKEN) + _, prefix = prefix_plus_prefix_token.split(FIM_PREFIX_TOKEN) + suffix, middle = rest.split(FIM_MIDDLE_TOKEN) + + self.assertEqual(prefix + middle + suffix, psm_original_text) + + # Next, Suffix-Prefix-Middle + spm_input_row, spm_output_row = perform_single_row_rewrite( + row_text=mk_text(1), + fim_rate=1.0, + psm_spm_split=0, + ) + + spm_original_text = spm_input_row["text"] + spm_final_text = spm_output_row["text"] + + suffix_plus_suffix_token, rest = spm_final_text.split(FIM_PREFIX_TOKEN) + _, suffix = suffix_plus_suffix_token.split(FIM_SUFFIX_TOKEN) + prefix, middle = rest.split(FIM_MIDDLE_TOKEN) + + self.assertEqual(prefix + middle + suffix, spm_original_text) + + def test__fim_and_reordering_split_rates_work(self) -> None: + _, output_row = 
perform_single_row_rewrite( + row_text=mk_text(300_000), + fim_rate=0.5, + psm_spm_split=0.5, + ) + + final_text = output_row["text"] + files = final_text.split(FILE_SEPARATOR) + + self.assertEqual(len(files), 300_000) + + psm_reordered = 0 + spm_reordered = 0 + + psm_match = r"<\|fim_prefix\|>.+<\|fim_suffix\|>.+<\|fim_mid\|>.+" + spm_match = r"<\|fim_suffix\|>.+<\|fim_prefix\|>.+<\|fim_mid\|>.+" + + for file in files: + for _ in re.finditer(psm_match, file, re.DOTALL): + psm_reordered += 1 + for _ in re.finditer(spm_match, file, re.DOTALL): + spm_reordered += 1 + + self.assertAlmostEqual((psm_reordered + spm_reordered) / 300_000, 0.5, 2) + self.assertAlmostEqual(psm_reordered / (psm_reordered + spm_reordered), 0.5, 2) + + def test__fim_needs_at_least_five_characters_to_rearrange(self) -> None: + for i in range(5): + starting_string = "a" * i or "" + final_string = perform_single_row_rewrite( + row_text=starting_string, + fim_rate=1, + psm_spm_split=1, + )[ + 0 + ]["text"] + + if i < 5: + self.assertEqual(final_string, starting_string) + else: + self.assertTrue(FIM_PREFIX_TOKEN in final_string) + + def test__fim_handles_lots_of_partitions_with_lots_of_rows(self) -> None: + results = perform_rewrites( + num_partitions=5, num_rows_per_partition=2, row_text=mk_text(10), fim_rate=1, psm_spm_split=1 + ) + + self.assertEqual(len(results), 5) + + for inputs, outputs in results: + self.assertEqual(len(inputs), 2) + self.assertEqual(len(outputs), 2) + + psm_match = r"<\|fim_prefix\|>.+<\|fim_suffix\|>.+<\|fim_mid\|>.+(<\|file_sep\|>)?" 
+ + for output in outputs: + num_rewrites = 0 + files = output["text"].split(FILE_SEPARATOR) + for file in files: + num_rewrites += len([re.finditer(psm_match, file, re.DOTALL)]) + self.assertEqual(num_rewrites, 10) diff --git a/contrib/tokens-sanitizer/.gitignore b/contrib/tokens-sanitizer/.gitignore new file mode 100644 index 00000000..b9b71011 --- /dev/null +++ b/contrib/tokens-sanitizer/.gitignore @@ -0,0 +1 @@ +data/output diff --git a/contrib/tokens-sanitizer/Cargo.lock b/contrib/tokens-sanitizer/Cargo.lock new file mode 100644 index 00000000..b8b7ecdd --- /dev/null +++ b/contrib/tokens-sanitizer/Cargo.lock @@ -0,0 +1,657 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = 
"anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "cc" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "console" +version = "0.15.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + 
+[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "flate2" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ 
+ "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "log" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" +dependencies = [ + "adler2", +] + +[[package]] +name = "mj_io" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d233748539aa5c1d0946ce06d14cc74dcb80b5a419b984d6e3f726cb05b749d" +dependencies = [ + "anyhow", + "flate2", + "glob", + "indicatif", + "zstd", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" + +[[package]] +name = "pkg-config" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" + +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + +[[package]] +name = "proc-macro2" +version = "1.0.93" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "ryu" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" + +[[package]] +name = "serde" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.138" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tokens-sanitizer" +version = "0.1.0" +dependencies = [ + "clap", + "mj_io", + "rayon", + "regex", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + 
"windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/contrib/tokens-sanitizer/Cargo.toml b/contrib/tokens-sanitizer/Cargo.toml new file mode 100644 index 00000000..7579a56f --- /dev/null +++ b/contrib/tokens-sanitizer/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "tokens-sanitizer" +version = "0.1.0" +edition = "2021" + +[dependencies] +clap = { version = "4.5.29", features = ["derive"] } +mj_io = "0.1.2" +rayon = "1.10.0" +regex = "1.11.1" +serde_json = "1.0.138" diff --git a/contrib/tokens-sanitizer/README.md b/contrib/tokens-sanitizer/README.md new file mode 100644 index 00000000..ff99263c --- /dev/null +++ b/contrib/tokens-sanitizer/README.md @@ -0,0 +1,103 @@ +# Tokens Sanitizer + +This script is designed to sanitize documents by replacing special tokens with properly spaced versions. + +Why would you want to do that? +Usually, when tokenizing data for pretraining, we split any special tokens that occur naturally in the text before adding special tokens to denote the end of documents. 
+This is useful for cases when you want a model to be able to answer questions such as "what is <|endoftext|>?"; +if you don't escape special tokens properly, the model would conflate different uses of `<|endoftext|>`, and likely would never learn its meta-semantics (since it is vastly more likely to be used to indicate the end of a document). + +However, in some cases, a data pipeline might inject special tokens that should be treated as such: +For example, when concatenating files in a code repository, you might want to use `<|repo_name|>author/repo<|file_sep|>` to prefix all your code documents. + +In that case, we use the following strategy: + +1. We first run this sanitization script to process all original documents; if used as described below, this script would take care of modifying special tokens in a way that is equivalent to them being escaped. +2. Then, we perform whatever text substitution we want to inject special tokens. +3. Finally, we tokenize, but, this time, we set `split_special_tokens=False`, since the sanitization script already took care of encoding. + +**⚠️ IMPORTANT ⚠️** + +You must use a **tokenizer that has been modified** to support sanitization. In the rest of this README, we use [`allenai/dolma2-tokenizer-U10F0F0`](https://huggingface.co/allenai/dolma2-tokenizer-U10F0F0). + + +## How does the sanitization work? + +When running the sanitization script, we inject a Unicode code point from [Supplementary Private Use Area-B (SPUA-B)](https://en.wikipedia.org/wiki/Private_Use_Areas). +The Unicode consortium does not assign symbols to these code points; rather, they are left open for private entities to use in their own software. +While the Basic Private Use Area (U+E000 to U+F8FF) is commonly used (for example, Apple uses `U+F8FF` for the Apple logo ), the supplementary area B is exceedingly rare. +I picked `U+10F0F0` for sanitization. 
+ +In order for sanitization to work, we must use a tokenizer that is designed to split off this token as part of its pre-tokenization strategy. +In the HuggingFace Tokenizers library, we achieve that by adding the following pre-tokenization rule: + +```json + ... + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "String": "􏃰" + }, + "behavior": "Removed", + "invert": false + }, + ... +``` + +## Usage + +Here's an example of how to use this sanitization script. + +```shell +cargo run --release -- \ + --inputs 'data/input/*jsonl' \ + --output data/output \ + --substitutions '<|endoftext|>=<|􏃰endoftext|>' \ + --substitutions '<|fim_prefix|>=<|􏃰fim_prefix|>' \ + --substitutions '<|fim_middle|>=<|􏃰fim_middle|>' \ + --substitutions '<|fim_suffix|>=<|􏃰fim_suffix|>' \ + --substitutions '<|im_start|>=<|􏃰im_start|>' \ + --substitutions '<|im_end|>=<|􏃰im_end|>' \ + --substitutions '<|endofprompt|>=<|􏃰endofprompt|>' \ + --substitutions '<|pad|>=<|􏃰pad|>' \ + --substitutions '<|repo_name|>=<|􏃰repo_name|>' \ + --substitutions '<|file_sep|>=<|􏃰file_sep|>' +``` + +If you are tokenizing using the Dolma toolkit, you should tokenize using the following config: + +```yaml +destination: ... +documents: + - ... + +processes: ... +seed: ... +max_size: ... +dtype: uint32 + +tokenizer: + name_or_path: allenai/dolma2-tokenizer-U10F0F0 + bos_token_id: null + eos_token_id: 100257 + pad_token_id: 100277 + segment_before_tokenization: false + encode_special_tokens: false +``` + +Note the `encode_special_tokens` and `name_or_path` keys. + +If you are using Hugging Face `transformers`, make sure to set `split_special_tokens` to `False`: + +```python +from transformers import AutoTokenizer + +tok = AutoTokenizer.from_pretrained("allenai/dolma2-tokenizer-U10F0F0") + +text = ... 
+ +tok(text, split_special_tokens=False) +``` diff --git a/contrib/tokens-sanitizer/data/input/f1.jsonl b/contrib/tokens-sanitizer/data/input/f1.jsonl new file mode 100644 index 00000000..43f00e45 --- /dev/null +++ b/contrib/tokens-sanitizer/data/input/f1.jsonl @@ -0,0 +1 @@ +{"text":"ChatGPT uses <|endoftext|> to denote the end of a text sequence."} diff --git a/contrib/tokens-sanitizer/src/main.rs b/contrib/tokens-sanitizer/src/main.rs new file mode 100644 index 00000000..03ed8ec0 --- /dev/null +++ b/contrib/tokens-sanitizer/src/main.rs @@ -0,0 +1,182 @@ +use clap::Parser; +use std::io::{BufRead, Error}; +use std::path::PathBuf; + +use regex::Regex; +use serde_json; +use mj_io::{expand_dirs, read_pathbuf_to_mem, write_mem_to_pathbuf, build_pbar}; +use rayon::prelude::*; + + +fn parse_key_val(s: &str) -> Result<(String, String), String> { + let re = Regex::new(r#"[^\\](?=)"#).unwrap(); + let pos = re.find(s).ok_or_else(|| "No '=' found in {:?}".to_string())?; + Ok((s[..pos.start() + 1].to_string(), s[pos.end()..].to_string())) +} + + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Input files to process + #[arg(short, long, required = true)] + inputs: Vec, + + /// Destination output file + #[arg(short, long, required = true)] + output: PathBuf, + + /// Substitutions in the form KEY=VALUE + #[arg(short, long, required = true, value_parser = parse_key_val)] + substitutions: Vec<(String, String)>, +} + + +/// Compute the longest common prefix of a non-empty slice of paths. +fn compute_common_prefix(paths: &[PathBuf]) -> PathBuf { + if paths.is_empty() { + return PathBuf::new(); + } + + // Split each path into its components. + let components: Vec> = paths + .iter() + .map(|p| p.components().map(|c| c.as_os_str()).collect()) + .collect(); + + // Find the minimum number of components among all paths. 
+ let min_len = components.iter().map(|comp| comp.len()).min().unwrap_or(0); + + let mut common = PathBuf::new(); + // Iterate index-by-index comparing components. + for i in 0..min_len { + let candidate = components[0][i]; + if components.iter().all(|comp| comp[i] == candidate) { + common.push(candidate); + } else { + break; + } + } + common +} + +/// Given a list of input paths and a destination prefix, +/// returns a vector where each file is the destination prefix plus +/// the input file’s path relative to the shared prefix. +fn map_paths_to_destination(inputs: &Vec, dest_prefix: PathBuf) -> Vec { + if inputs.is_empty() { + return Vec::new(); + } + + if inputs.len() == 1 { + // get filename from single input path + let src_filename = inputs[0].file_name().unwrap(); + let dst_filename = dest_prefix.join(src_filename); + return vec![dst_filename]; + } + + let common_prefix = compute_common_prefix(&inputs); + inputs + .into_iter() + .map(|input| { + // Calculate the relative path from the common prefix. + let relative = input + .strip_prefix(&common_prefix) + .expect("All inputs should share the common prefix"); + // Build the new destination path. 
+ let mut new_path = dest_prefix.clone(); + new_path.push(relative); + new_path + }) + .collect() +} + + +fn find_all_paths(inputs: Vec) -> Vec { + let all_paths: Vec = inputs.into_iter().map( + |path| { + let manual_ext: Option>; // Store Vec instead of &[&str] + let input_paths: Vec; + + match path.extension() { + Some(ext) => { + let ext_str = ext.to_string_lossy().into_owned(); // Convert to owned String + manual_ext = Some(vec![ext_str]); // Store as Vec + let mut trunk_path = path.clone(); + trunk_path.pop(); + input_paths = vec![trunk_path]; + } + None => { + manual_ext = None; + input_paths = vec![path]; + } + } + + // Convert Vec to Vec<&str> before passing it to expand_dirs + let manual_ext_refs: Option> = manual_ext.as_ref().map( + |v| + v.iter().map(|s| s.as_str()).collect() + ); + expand_dirs(input_paths, manual_ext_refs.as_deref()).unwrap_or_default() + } + ).flatten().collect(); + return all_paths; +} + + +fn apply_substitutions(s: &str, subs: &[(String, String)]) -> Result { + let mut result = s.to_string(); + for (key, val) in subs { + result = result.replace(key, val); + } + Ok(result) +} + + +fn process_single(src_path: &PathBuf, dst_path: &PathBuf, subs: &[(String, String)]) -> Result<(), Error> { + + println!("Processing {:?} -> {:?}", src_path, dst_path); + let src_buf = read_pathbuf_to_mem(src_path).unwrap(); + let mut out_bytes: Vec = Vec::new(); + let newline: u8 = b'\n'; + + for line in src_buf.lines() { + let line = line.unwrap(); + let mut json_obj: serde_json::Value = serde_json::from_str(&line).unwrap(); + + let src_text = json_obj.get("text").unwrap().as_str().unwrap(); + let new_text = apply_substitutions(src_text, subs).unwrap(); + json_obj["text"] = serde_json::Value::String(new_text); + + out_bytes.extend(serde_json::to_vec(&json_obj).unwrap()); + out_bytes.push(newline); + } + + write_mem_to_pathbuf(&out_bytes, dst_path).unwrap(); + Ok(()) +} + + + +fn main() { + // parse command line arguments + let args: Args = Args::parse(); + 
+ // for each prefix, we derive both + let all_src: Vec = find_all_paths(args.inputs); + + println!("Found {} paths to process", all_src.len()); + + let all_dst = map_paths_to_destination(&all_src, args.output.clone()); + + let pbar = build_pbar(all_src.len(), "Processing files"); + + // here we can use rayon to parallelize the mapping operation + all_src.into_iter().zip(all_dst.into_iter()).collect::>().par_iter().for_each( + |(src_path, dst_path)|{ + process_single(src_path, dst_path, &args.substitutions).unwrap(); + pbar.inc(1); + } + ); + +} diff --git a/contrib/tokens-sanitizer/tests/test_sanitizer.py b/contrib/tokens-sanitizer/tests/test_sanitizer.py new file mode 100644 index 00000000..52b0d462 --- /dev/null +++ b/contrib/tokens-sanitizer/tests/test_sanitizer.py @@ -0,0 +1,62 @@ +import json +import shlex +import subprocess +from pathlib import Path + +from transformers import AutoTokenizer + +MOD_DOLMA_TOKENIZER = "allenai/dolma2-tokenizer-U10F0F0" +OG_DOLMA_TOKENIZER = "allenai/dolma2-tokenizer" + +CMD = """ +cargo run -- \ + --inputs 'data/input/*.jsonl' \ + --output data/output \ + --substitutions '<|endoftext|>=<|􏃰endoftext|>' \ + --substitutions '<|fim_prefix|>=<|􏃰fim_prefix|>' \ + --substitutions '<|fim_middle|>=<|􏃰fim_middle|>' \ + --substitutions '<|fim_suffix|>=<|􏃰fim_suffix|>' \ + --substitutions '<|im_start|>=<|􏃰im_start|>' \ + --substitutions '<|im_end|>=<|􏃰im_end|>' \ + --substitutions '<|endofprompt|>=<|􏃰endofprompt|>' \ + --substitutions '<|pad|>=<|􏃰pad|>' +""" + + +def _find_rust_root() -> Path: + rust_root = Path(__file__) + while True: + if rust_root == Path("/"): + raise FileNotFoundError("Could not find rust root") + if (rust_root / "Cargo.toml").exists(): + return rust_root + rust_root = rust_root.parent + + +def test_sanitizer(): + og_tok = AutoTokenizer.from_pretrained(OG_DOLMA_TOKENIZER) + mod_tok = AutoTokenizer.from_pretrained(MOD_DOLMA_TOKENIZER) + + root_dir = _find_rust_root() + subprocess.run(shlex.split(CMD), check=True, 
cwd=root_dir) + + input_dir = root_dir / "data" / "input" + output_dir = root_dir / "data" / "output" + + for input_file in input_dir.glob("*.jsonl"): + with input_file.open("r") as f: + input_docs = [json.loads(line) for line in f] + + output_file = output_dir / input_file.name + with output_file.open("r") as f: + output_docs = [json.loads(line) for line in f] + + for input_doc, output_doc in zip(input_docs, output_docs): + input_tokens = og_tok.tokenize(input_doc["text"], split_special_tokens=True) + output_tokens = mod_tok.tokenize(output_doc["text"], split_special_tokens=False) + + assert input_tokens == output_tokens + + +if __name__ == "__main__": + test_sanitizer() diff --git a/tests/data/multiple_files/cc_en_head-0091.jsonl b/tests/data/multiple_files/cc_en_head-0091.jsonl new file mode 100644 index 00000000..8d416f97 --- /dev/null +++ b/tests/data/multiple_files/cc_en_head-0091.jsonl @@ -0,0 +1,10 @@ +{"added":"2023-04-07T15:24:38.743534+00:00","created":"2020-03-29T09:04:10Z","id":"http://100kinvesting.com/2016/11/28/first-teleconference-calls-invoice-factoring/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T09:04:10Z","digest":"sha1:DNJVF5QUTTJGEF56WA5VCYUVVD3OW7XL","language":"en","language_score":0.95,"length":569,"line_ids":[41,42,56,57,59],"nlines":5,"original_length":2734,"original_nlines":105,"perplexity":304.6,"provenance":"cc_en_head-0112.json.gz:1","source_domain":"100kinvesting.com","title":"First Teleconference Calls for Invoice Factoring, and more. — 100K Investing, LLC.","url":"http://100kinvesting.com/2016/11/28/first-teleconference-calls-invoice-factoring/"},"source":"common-crawl","text":"We will be starting to hold weekly teleconferences for businesses that may be interested in invoice factoring. 
In addition, there will be guest speakers to discuss seller financed mortgages, annuity sales, probate advances, and more.\nIf you're in or around any of these areas, check out the teleconference for invoice factoring. Note that these are open to the public, and anyone with a business that has invoices is welcome to attend."} +{"added":"2023-04-07T15:24:38.763268+00:00","created":"2020-03-29T08:44:21Z","id":"http://100women.ng/category/celebrating-female-entrepreneurs/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T08:44:21Z","digest":"sha1:YV4FOGWSHA54G3CMQZQ7GSFGSPDW4O5X","language":"en","language_score":0.96,"length":2059,"line_ids":[31,52,59,60,61,62,63,65,70,73,74,75,76,77,89,92,93,95,96,97,98,99,106,115,117,118,119,120,121,128],"nlines":30,"original_length":5880,"original_nlines":188,"perplexity":320.1,"provenance":"cc_en_head-0112.json.gz:2","source_domain":"100women.ng","title":"celebrating female entrepreneurs Archives - SME100women","url":"http://100women.ng/category/celebrating-female-entrepreneurs/"},"source":"common-crawl","text":"pressure of the work, has endeared her to many motorists and passerby.\nheadmaster which was created and directed by her husband ambassador Segun Olusola.\nit ran for over a decade.\nThe beginning of a new year is often the time for personal and organizational projections, especially financial planning for the year ahead. 
Indeed, making a rock-solid financial plan should be at the heart of any smart year plan, as financial security has far-reaching implications for our lives and the organizations we operate in."} +{"added":"2023-04-07T15:24:39.101492+00:00","created":"2020-03-29T09:11:03Z","id":"http://2011.rubyworld-conf.org/en/program/A-9/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T09:11:03Z","digest":"sha1:3IEEKFBVIQ5ATHW4NMB4GNAXRCD5DVII","language":"en","language_score":0.91,"length":1524,"line_ids":[0,1,8,9,11,13,15],"nlines":7,"original_length":2191,"original_nlines":36,"perplexity":270.8,"provenance":"cc_en_head-0112.json.gz:3","source_domain":"2011.rubyworld-conf.org","title":"A-9 Sulayman K. Sowe : Sustainable Programming for Africa Project: Ruby SPA | RubyWorld Conference 2011","url":"http://2011.rubyworld-conf.org/en/program/A-9/"},"source":"common-crawl","text":"This talk presents the Ruby SPA project to be jointly undertaken by Japanese and African partners. The main aim of the Ruby SPA project is to provide the knowledge and skills the next generation of African software engineers, computer scientists, and information systems managers will need to meet the challenges of the information society and actively participate in the developments of sustainable technologies to solve practical problems affecting their locality and the global community. 
Some of the main objectives of Ruby SPA are to provide the African partners the infrastructure and teaching and learning material needed to effectively teach Ruby programming in African universities."} +{"added":"2023-04-07T15:24:40.617377+00:00","created":"2020-03-29T09:55:33Z","id":"http://911blogger.com/news/2006-11-02/pentagon-video-doubletree-be-released-within-week","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T09:55:33Z","digest":"sha1:I65J5PE6ZEC54PHELB5BYJYTEZKIKXUL","language":"en","language_score":0.95,"length":108593,"line_ids":[7,10,11,12,13,14,15,16,18,19,20,25,29,33,34,41,43,44,48,49,50,51,58,59,60,69,70,75,82,83,84,85,86,90,91,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,161,162,163,164,165,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,201,202,209,210,217,226,229,233,234,243,244,245,249,250,251,252,257,258,259,263,264,265,269,270,275,276,278,282,283,291,293,294,295,296,297,302,303,304,305,309,310,311,315,316,317,318,323,324,325,330,331,332,333,334,335,340,341,342,346,347,353,354,355,360,361,365,366,367,368,369,370,371,376,377,378,382,383,384,385,386,387,388,389,390,394,395,396,397,402,403,404,405,409,410,411,415,416,417,418,419,424,425,426,427,440,441,448,449,450,455,456,457,458,459,464,465,469,473,474,479,480,481,482,483,484,487,491,492,493,494,495,496,497,498,499,500,501,506,507,514,515,522,523,524,529,530,531,532,533,540,541,542,549,550,551,552,553,554,555,559,560,561,562,563,564,565,566,567,568,569,572,576,585,586,591,592,600,601,602,603,604,605,609,610,615,616,621,622,623,627,628,629,637,638,639,640,641,645,646,647,648,653,654,655,663,664,669,670,671,672,673,674,678,
679,680,684,685,686,687,691,692,697,698,699,700,701,705,706,714,715,720,721,726,727,728,733,734,738,739,745,746,751,752,753,758,759,760,765,767,768,773,774,775,779,780,781,786,788,789,790,791,792,796,797,798,799,803,804,805,806,810,811,816,817,818,819,820,826,827,828,833,840,841,849,850,855,856,857,862,863,864,865,870,871,872,880,882,883,888,895,896,897,898,899,903,904,905,912,913,914,915,916,917,918,919,924,925,926,927,928,929,933,934,942,943,944,950,951,952,956,957,958,959,964,965,966,967,972,973,978,979,980,984,985,986,987,988,989,994,995,996,997,1001,1002,1003,1004,1009,1010,1014],"nlines":484,"original_length":128189,"original_nlines":1060,"perplexity":296.3,"provenance":"cc_en_head-0112.json.gz:7","source_domain":"911blogger.com","title":"Pentagon Video From Doubletree To Be Released Within a Week | 911Blogger.com","url":"http://911blogger.com/news/2006-11-02/pentagon-video-doubletree-be-released-within-week"},"source":"common-crawl","text":"View from Doubletree Hotel resteraunt showing the Pentagon (red arrow). Source.\nSo how much do you want to bet that this Doubletree Hotel security video will be released before this Tuesday (election day) to “shock & awe” the voters in hopes to sway the elections, especially if this video finally shows a plane hitting the Pentagon?\nCheck out the full article via the link above.\nThanks Killtown for the reminder!\nit was always said it would be released by 11/9...\nDHS on Thu, 11/02/2006 - 6:21pm.\nI've always said it was Flight 77.\nI'm not saying you are wrong...just how did you draw those conclusions?\ncard51short on Thu, 11/02/2006 - 6:29pm.\nFrom talking with people...\nLike John Judge, and from research presented by www.pentagonresearch.com, and the fact that the argument is a waste of time in and of itself. In my opinion anyway.\nThe argument is a waste of time, to me, because it's one of those discussions that could go on and on and on and on and on, and ultimately, you never come to a conclusion. 
Reason being, there's no video (to date) that shows it.\nSo, I focus on different things... which people are tired of hearing me say, but that's the way it is.\nHow is it possible that 34 minutes after the SECOND tower was hit, when even Joe Schmoe's like me knew America was \"under attack\", that a hijacked plane with kamikaze pilots managed to penetrate Washington D.C. airspace, the most defended airspace in the world? Don't you think it would have been intelligent of them to deploy fighters over the skies of D.C. RIGHT when the attacks occurred? Why weren't they? Who was in charge of making those kinds of decisions?\nNorman Mineta testified that he arrived at the Presidential Emergency Operations Center at 9:20, and Dick Cheney was already there. By 9:25, a young man came in and out of the room three times to tell Cheney the position of Flight 77, and asked for confirmation of orders. Orders that couldn't have been a \"shoot-down\" order because according to the latest Vanity Fair piece, fighter pilots claimed by 10:10:31, they had, \"negative clearance to fire.\" So what order was it? Who is the young man, and why didn't he testify before the 9/11 Commission? Why did Dick Cheney claim that he had given a \"shoot down\" order? Why did the 9/11 Commission claim that Dick Cheney arrived at the PEOC by 9:58am? Why did the 9/11 Report completely omit Norman Mineta's testimony?\nObviously, details like inept patsies and protective moles leading to multi-billion-dollar defense failures and ultimately \"Angel is next\" are much harder to deal with than throwing the Popular Mechanics propeller heads at Dylan Avery and crew. For any of the leadership to go off SOP while the \"nation is under attack\" is treasonable. They will testify only with the threat of the noose before them. Treason for the lot of them!\nblog dog (not verified) on Thu, 11/02/2006 - 7:44pm.\nI'd rather be proven wrong than right. 
If a video comes out that shows Flight 77 hitting the Pentagon, it's gonna hurt.\nIts probably a plane, but it wasn't 77 based on radar data.\nTerrorstorm is #21 at Amazon!!!\nSomebigguy on Thu, 11/02/2006 - 8:37pm.\nRadar data are you referring to?\nI very much apologize for this rather long posting, but I think it is important for this thread.\nAugust 2001: Hani Hanjour Successfully Takes Certification Flight?\n(After 8:56-9:24 a.m.): Pentagon Emergency Center Knows Flight 77 Is Hijacked; NORAD Not Notified?\nIndianapolis flight control reports the loss of contact with Flight 77 to the FAA regional center. They describe it as a possible crash. The center waits 16 minutes before passing the information to FAA headquarters at 9:25 a.m. (see 9:25 a.m.) [Washington Post, 11/3/2001; 9/11 Commission, 6/17/2004] However, American Airlines headquarters has been notified of the same information before 9:00 a.m. (see (Before 9:00 a.m.)).\nAccording to counterterrorism “tsar” Richard Clarke and others, Vice President Cheney goes from his White House office to the Presidential Emergency Operations Center (PEOC), a bunker in the East Wing of the White House, at about this time. National Security Adviser Rice, after initiating a video conference with Richard Clarke in the West Wing, goes to the PEOC to be with Cheney. There is no video link between response centers in the East and West Wings, but a secure telephone line is used instead. [Clarke, 2004, pp. 3-4; ABC News, 9/14/2002; New York Times, 9/16/2001; Daily Telegraph, 12/16/2001] One eyewitness account, David Bohrer, a White House photographer, says Cheney leaves for the PEOC just after 9:00 a.m. [ABC News, 9/14/2002] However, there is a second account claiming that Cheney doesn’t leave until sometime after 9:30 a.m. In this account, Secret Service agents burst into Cheney’s White House office. 
They carry him under his arms—nearly lifting him off the ground—and propel him down the steps into the White House basement and through a long tunnel toward an underground bunker. [Washington Post, 1/27/2002; BBC, 9/1/2002; Newsweek, 12/31/2001; New York Times, 10/16/2001; MSNBC, 9/11/2002; 9/11 Commission, 6/17/2004] At about the same time, National Security Adviser Rice is told to go to the bunker as well. [ABC News, 9/11/2002] In addition to the eyewitness accounts of Clarke and Bohrer, ABC News claims that Cheney is in the bunker when he is told Flight 77 is 50 miles away from Washington at 9:27 a.m., suggesting that accounts of Cheney entering the bunker after 9:27 a.m. are likely incorrect.\nRenee May, a flight attendant on Flight 77, uses a cell phone to call her mother in Las Vegas. She tells her mother that the flight has been hijacked, and that everyone has been asked to move to the back of the plane. She asks her mother to call American Airlines and let them know Flight 77 has been hijacked. Her mother (Nancy May) calls the airline. [Las Vegas Review-Journal, 9/13/2001; Las Vegas Review-Journal, 9/15/2001; 9/11 Commission, 1/27/2004; San Francisco Chronicle, 7/23/2004] American Airlines headquarters is already aware that Flight 77 is hijacked, but supposedly Indianapolis flight control covering the flight still is not told.\nIn a government report analyzing the effectiveness of rescue worker response to the Pentagon crash, it is mentioned that, “At about 9:20 a.m., the WFO [FBI Washington Field Office] Command Center [is] notified that American Airlines Flight 77 had been hijacked shortly after takeoff from Washington Dulles International Airport. [Special Agent in Charge Arthur] Eberhart dispatche[s] a team of 50 agents to investigate the Dulles hijacking and provide additional security to prevent another. He sen[ds] a second team to Ronald Reagan Washington National Airport as a precautionary step. 
At the WFO Command Center, Supervisory Special Agent (SSA) Jim Rice [is] on the telephone with the Pentagon when Flight 77 crashe[s] into the building.” [US Department of Health and Human Services, 7/2002] Yet according to the 9/11 Commission, NORAD is not told that Flight 77 had been hijacked at this time or any time before it crashes. However, the FAA has claimed they officially warned NORAD at 9:24 a.m. (see (9:24 a.m.)) and informally warned them even earlier (see (9:24 a.m.)).\nA passenger on Flight 77, Barbara Olson, calls her husband, Theodore (Ted) Olson, who is Solicitor General at the Justice Department. [San Francisco Chronicle, 7/23/2004] Ted Olson is in his Justice Department office watching WTC news on television when his wife calls. A few days later, he says, “She told me that she had been herded to the back of the plane. She mentioned that they had used knives and box cutters to hijack the plane. She mentioned that the pilot had announced that the plane had been hijacked.” [CNN, 9/14/2001] He tells her that two planes have hit the WTC. [Daily Telegraph, 3/5/2002] She feels nobody is taking charge. [CNN, 9/12/2001] He doesn’t know if she was near the pilots, but at one point she asks, “What shall I tell the pilot? What can I tell the pilot to do?” [CNN, 9/14/2001] Then she is cut off without warning. [Newsweek, 9/29/2001] Ted Olson’s recollection of the call’s timing is extremely vague, saying it “must have been 9:15 [am.] or 9:30 [am.]. Someone would have to reconstruct the time for me.” [CNN, 9/14/2001] Other accounts place it around 9:25 a.m. [Miami Herald, 9/14/2001; New York Times, 9/15/2001; Washington Post, 9/21/2001] The call is said to have lasted about a minute. [Washington Post, 9/12/2001] By some accounts, his message that planes have hit the WTC comes later, in a second phone call. [Washington Post, 9/21/2001] In one account, Barbara Olson calls from inside a bathroom. 
[Evening Standard, 9/12/2001] In another account, she is near a pilot, and in yet another she is near two pilots. [Boston Globe, 11/23/2001] Ted Olson’s account of how Barbara Olson made her calls is also conflicting. Three days after 9/11, he says, “I found out later that she was having, for some reason, to call collect and was having trouble getting through. You know how it is to get through to a government institution when you’re calling collect.” He says he doesn’t know what kind of phone she used, but he has “assumed that it must have been on the airplane phone, and that she somehow didn’t have access to her credit cards. Otherwise, she would have used her cell phone and called me.” [Hannity & Colmes, 9/14/2001] Why Barbara Olson would have needed access to her credit cards to call him on her cell phone is not explained. However, in another interview on the same day, he says that she used a cell phone and that she may have been cut off “because the signals from cell phones coming from airplanes don’t work that well.” [CNN, 9/14/2001] Six months later, he claims she called collect “using the phone in the passengers’ seats.” [Daily Telegraph, 3/5/2002] However, it is not possible to call on seatback phones, collect or otherwise, without a credit card, which would render making a collect call moot. Many other details are conflicting, and Olson faults his memory and says that he “tends to mix the two [calls] up because of the emotion of the events.” [CNN, 9/14/2001] The couple liked to joke that they were at the heart of what Hillary Clinton famously called a “vast, right-wing conspiracy.” Ted Olson has been a controversial choice as Solicitor General since he argued on behalf of Bush before the Supreme Court in the 2000 presidential election controversy before being nominated for his current position.\nAccording to the 9/11 Commission, the FAA Command Center advises the Dulles Airport terminal control facility in Washington to look for primary targets. 
[9/11 Commission, 6/17/2004] By at least one account, Dulles notices Flight 77 a few minutes later.\nAccording to the 9/11 Commission, the FAA Command Center advises FAA headquarters that American 77 is lost in Indianapolis flight control’s airspace, that Indianapolis has no primary radar track, and is looking for the aircraft. [9/11 Commission, 6/17/2004] The Command Center had learned this 16 minutes earlier at 9:09 a.m. (see 9:09 a.m.). American Airlines headquarters was notified of the same information before 9:00 a.m. (see (Before 9:00 a.m.)).\nRadar tracks Flight 77 as it closes within 30 miles of Washington. [CBS News, 9/21/2001] Todd Lewis, flight controller at Washington’s Dulles Airport, later recalls, “... my colleagues saw a target moving quite fast from the northwest to the southeast. So she—we all started watching that target, and she notified the supervisor. However, nobody knew that was a commercial flight at the time. Nobody knew that was American 77. ... I thought it was a military flight.” [MSNBC, 9/11/2002] Another account is similar, saying that just before 9:30 a.m., a Dulles Airport controller sees an aircraft without a transponder traveling almost 500 mph headed toward Washington. [USA Today, 8/13/2002] In yet another account, Danielle O’Brien, the Dulles flight controller said to be the first to spot the blip, claims she doesn’t spot it until it is around 12 to 14 miles from Washington. [ABC News, 10/24/2001; ABC News, 10/24/2001] There are also accounts that Vice President Cheney is told around 9:27 a.m. that radar is tracking Flight 77, 50 miles away from Washington. The 9/11 Commission says the plane isn’t discovered until 9:32 a.m.\n(9:30 a.m.): Who Warns Who of Flight 77’s Impending Approach to D.C.?\nThe three Langley fighters are airborne, but just where they go and how fast are in dispute. There are varying accounts that the fighters are ordered to Washington, New York, Baltimore, or no destination at all. 
The 9/11 Commission Reports that, in fact, the pilots don’t understand there is an emergency and head east. They give three reasons. “First, unlike a normal scramble order, this order did not include a distance to the target, or the target’s location. Second, a ‘generic’ flight plan incorrectly led the Langley fighters to believe they were ordered to fly due east (090) for 60 miles. The purpose of the generic flight plan was to quickly get the aircraft airborne and out of local airspace. Third, the lead pilot and local FAA controller incorrectly assumed the flight plan instruction to go ‘090 for 60’ was newer guidance that superseded the original scramble order.” [9/11 Commission, 6/17/2004] However, the Wall Street Journal gives a different explanation, surprisingly from 9/11 Commission testimony. “Once they got in the air, the Langley fighters observed peacetime noise restrictions requiring that they fly more slowly than supersonic speed and takeoff over water, pointed away from Washington, according to testimony before the [9/11 Commission].” The fighters that departed to New York City over 30 minutes earlier at 8:52 a.m. (see 8:52 a.m.) traveled faster than supersonic because they realized they were in a national emergency. [Wall Street Journal, 3/22/2004 ] In 2003 testimony, NORAD Commander Major General Larry Arnold explains that the fighters head over the ocean because NORAD is “looking outward” and has to have clearance to fly over land. [9/11 Commission, 5/23/2003] One of the Langley pilots, Craig Borgstrom, later says that after taking off, “They (NEADS) [are] giving us the heading and altitude of north-northeast up to 20,000 feet. Then shortly after takeoff they changed our heading more north-westerly and gave us max-subsonic. That’s as fast as you can go without breaking the sound barrier.” Reportedly, the Langley fighters are now being vectored toward Washington, instead of New York. [Filson, 2004, pp. 
63-65] Yet, in contrast to these accounts, the BBC reports that just before takeoff at 9:24 a.m., the pilots are specifically told that Flight 77 may have been hijacked, and they get a cockpit signal indicating they are in an emergency wartime situation (see (9:24 a.m.)). All the above accounts concur that, for whatever reason, the fighters go too far east. They don’t reach Washington until roughly around 10:00 a.m.\nAccording to the 9/11 Commission, NEADS contacts Washington flight control to ask about Flight 11. A manager there happens to mention, “We’re looking—we also lost American 77.” The commission claims, “This was the first notice to the military that American 77 was missing, and it had come by chance. ... No one at FAA Command Center or headquarters ever asked for military assistance with American 77.” [9/11 Commission, 6/17/2004] Yet, 38 minutes earlier, flight controllers determined Flight 77 was off course, out of radio contact, and had no transponder signal (see (8:56 a.m.)). They’d warned American Airlines headquarters within minutes. By some accounts, this is the first time NORAD is told about Flight 77, but other accounts have them warned around 9:25 a.m.\nColin Scoggins at Boston flight control calls NEADS to report a low-flying airliner he has spotted six miles southeast of the White House. He can offer no details regarding its identity. The plane is reportedly Flight 77, but as it has its transponder turned off, no one realizes this at the time. The news of the plane “sets off a frenzy.” Major Kevin Nasypany orders Major James Fox, head of the NEADS Weapons Team, “Get your fighters there as soon as possible!” Staff Sergeant William Huckabone says, “Ma’am, we are going AFIO [emergency military control of the fighters] right now with Quit 2-5 [the Langley fighters]. They are going direct Washington.” [Vanity Fair, 8/1/2006] The Langley fighters will arrive over Washington some time around 10 a.m. 
(see (9:55-10:15 a.m.)).\nFrom the 9/11 Report...\nThere is conflicting evidence about when the Vice President arrived in the shelter conference room. We have concluded, from the available evidence, that the Vice President arrived in the room shortly before 10:00, perhaps at 9:58.\n15 references to \"Cheney\" in that little \"timeline\" I posted. Mostly having to do with what he knew, when he knew it, and at what time he arrived at the PEOC.\nOf course, the timeline ends at 9:37am. Dick Cheney didn't arrive at the PEOC for another 18 minutes. At least according to the 9/11 Report.\nWant to explain to me why this man deserves a \"free pass\" in regards to his actions on 9/11, and why he doesn't have to explain them publicly, or under oath?\ndidn't richard clarke and mineta both contradict Cheney's story?\nOne eyewitness account, David Bohrer, a White House photographer, says Cheney leaves for the PEOC just after 9:00 a.m.\nJon Gold on Fri, 11/03/2006 - 11:32am.\nHe seems the type who would be willing to take a leading role, knowing that people are probably more afraid of him than anyone else in Bushworld. This is a guy who has no problem telling people to go f themselves on the floor of congress, and shoots his own friends. who would want to mess with him?\n....our exalted Air Defense can't protect the PENTAGON from a COMMERCIAL airliner 34 minutes after the 2nd Tower is hit in exactly the same manner? With Andrews AFB just 10 miles away? It's ludicrous. Insultingly ludicrous.\nAlso, how is it possible they \"lost\" track of 77's flight path? It's not. There is no explanation given for that, is there? WTF.\naltruist on Fri, 11/03/2006 - 10:31am.\nJon Gold: \"Stuff like that.\"\nToo funny. And no one has answers for all you bring up.\nThe_Mineta _Tes... on Fri, 11/03/2006 - 12:51pm.\nThe so-called new video won't show a thing, you can bet on it. To me the whole Pentagon thing is just a distraction. 
Lets stick with what we can prove like the controlled demolition of WTC7 and the impossibility of the \"Official Story.\"\nmaddog on Thu, 11/02/2006 - 8:41pm.\nThere were eyewitness reports saying there was another plane in the sky flying low over Washington after the explosion at the Pentagon.... it would make sense that the plane simply flew over the top of the Pentagon as a missle hit..... all the witnesses would be focused on the explosion and just assume it were the plane that crashed.\nJJames (not verified) on Fri, 11/03/2006 - 9:19am.\nand we saw it in loose change in the spanish language footage...\nThe Pentagon is a huge smoking gun!\n·\tHani Hanjour could not have flown back from the Kentucky/Ohio border to D.C.\n·\tHanjour could have not made that incredible maneuver in a B-757 to hit the renovated section of the Pentagon.\n·\tA B-757 would NOT make a hole the size of a missile in the Pentagon.\n·\t84 videos of whatever hit the Pentagon are being withheld for no damn good reason except to cover-up. The only video released looks like an A3 Skywarrior!\n·\tThere is no way they could ID a planeload of people that smashed through the Pentagon @ 530 mph. The fictitious DNA results were made to bolster the official story.\ninto the building\" according to this old USA Today article, why won't they release the information to us???\nAnonymous (not verified) on Fri, 11/03/2006 - 11:31am.\nWhy don't they release the pentagon security camera footage and the DC Traffic Control footage and all the video available so that a true picture may be drawn. Seeing a large jet fly low towards the pentago from the perspective of a restaurant is fine, but when all other angles and vantages are available, what could possibly be the reason for not allowing the big picture to be seen. As well, the pentagon still is not the center of all things suspicious and damning. 
Let's talk about demolition.\ngreenback (not verified) on Thu, 11/02/2006 - 8:42pm.\nI know Alex Jones and others say that the Pentagon could be a 'honeypot', I don't think I agree like I usually do with him.\nI think if it was, it would have come out with the popularity of Loose Change and totally discredited the entire video (in the mainstream view).\nI'm 99% sure that this video will barely show what happened...why else would the FBI release it now?\n\nIf it does I hope people can see the CGI and how they spliced it.\nHere's the deal . . . Gone with the Wind didn't take this much time to produce and it was on celluloid and hand spliced. Most security cameras are time-domain multiplexed. At the NTSC frame rate of 30-frames-per-second, recording equipment divides the total cameras by this frame rate. For instance ten (10) cameras, 1/30th of the time is dedicated or 3 frames per second. This is why most security cameras typically appear like a sequence of time-lapse still frames.\nNow, let's say we see a video that runs at the full frame rate. I'd have to ask is it possible that the hotel had only one camera to one recording device, thus the full frame rate?\nI hope the truth \"is\" shown and this does not look like a hi-def full frame-rate production, again I'd really have questions if this be the case.\nIf I recall correctly, Tom Hanks shook hands with President Kennedy in the movie Forrest Gump - we all know that looked pretty convincing - more developed technology today. Just a personal observation - hoping for the best.\nin_Holloywood (not verified) on Thu, 11/02/2006 - 8:26pm.\nIt will show a fuzzier shot than what is depicted here. That's a quality film camera photo, not a webcam or surveillance cam photo.\nAnd with a perspective as such it would be easy to superimpose an airliner into the video since we all know a 767 is bigger than a skywarrior or global hawk. There is no way a 767 hit the pentagon. There are TWO engines, one hole. 
Argument over.\nilluminating911 (not verified) on Thu, 11/02/2006 - 7:58pm.\nIf it shows a 757, which I doubt, has there been enough time to make a good fake?\nOr just add a little blue, red and silver paint to the blur to convince the sheeple.\nWISDOM on Fri, 11/03/2006 - 9:57am.\nWhat difference will it make? You twoofers see what you want to see. If the video is low quality, you'll say it was a \"missile\". If it is high quality, you'll claim it was doctored. Your dopey theory isn't based on evidence, so facts are immaterial. You believe America attacked itself because you believe America is evil.\nRealWorld (not verified) on Fri, 11/03/2006 - 8:17pm.\n\"You believe America attacked itself because you believe America is evil,\" sounds like it came right out of Bill O'Reilly's mouth on Fox News. This is the typical disingenuous twisting and misrepresenting of any and all criticisms anybody brings forth against the Bush Fascist Regime. Well, you may have memorized the right wing's talking points, but you have not offered any intelligent arguments to refute the \"twoofers\", whatever that means.\nFirst of all, we don't believe that, \"America attacked itself.\" That is the most asinine thing anybody could possibly say. No, we believe that the Bush Fascists attacked America. The Bush Fascist cabal is not America, they attacked America.\nSecond of all, we don't believe \"America is evil.\" That is yet the same asinine distortion of our position. No, we believe that the Bush Fascists are evil. \nIf \"RealWorld\" is not receiving a paycheck from some unstated 3-lettered agency, \"RealWorld\" sure acts like an imposter of one.\nKeenan on Fri, 11/03/2006 - 9:33pm.\nHey Keenan, where are the Democrats and the anti-Bush media outlets like the New York Times or the Nation? How come they aren’t endorsing your America-hating crackpot drivel? 
These ludicrous theories would have taken the active participation of thousands and you and your fellow twoofers slander many good people. \nThree letter agency? \nRealWorld (not verified) on Fri, 11/03/2006 - 11:32pm.\n\nYou're like one of those wind-up Bill O'reilly dolls, just the same talking points over and over and over and over again...\n\"you hate America!...America-hating crackput drivel!...you don't have one iota of evidence!...you're saying America attacked itself!...you just believe America is evil!...you have no evidence!...terrorists!...hate!...freedom hating!...\nCareful, don't have an O'reilly spasmatic attack. What version of the Bill O'reilly chip did you install? You might need an upgrade.\nKeenan on Fri, 11/03/2006 - 11:57pm.\nRemember when the Twoof Movement was so sure of itself that the FBI conficated a security tape from the Sheration Nation Hotel, and that the video captured crystal-clear images of the object that hit the Pentagon?\nGuess you guys just decided to quitely brush that one under the rug, huh?\n\"The top photo [in the above link] shows the view from inside the hotel. On the diagram below (courtesy of Google Maps) it is obvious that they could not have recorded the impact since it was around the corner of the building. But depending on their camera locations they may have been able to record the flight path from various angles since the aircraft would have essentially looped around the building.\"\nFerric Oxide (not verified) on Thu, 11/02/2006 - 6:40pm.\nWhere is evidence of Flight 77 hitting pentagon?\nNormally I dismiss people who think F77 hit the pentagon as believers in the official story. Outside of the \"no-planers\" in regards to the WTC, there seems to be a breed of \"yes-planers\" in regards to Flight 77 hitting the pentagon, which breeds life within the 9/11 truth movement. 
Given the amount of evidence that F77 did not hit pentagon, such as, hole too small on pentagon wall (before outside wall collapse), and lack of significant debris, what evidence is there that points to F77 hitting Pentagon?\n911Truth (not verified) on Thu, 11/02/2006 - 6:41pm.\nThe \"yes-planers\" at the Pentagon group are composed of either A: a small but vocal bunch of disinfo agents such as Jim Hoffman, John Judge, and Mark Robinowitz at oilempire.us. or B: people who are dupes of the above. The fact is that there is no evidence so support the OGCT (Official Government Conspiracy Theory) that fllight 77 hit the pentagon, so instead, the agents generally resort to PSYOP tactics, such as ridicule and attempts to discredit the messenger. \"The argument that flight 77 didn't hit the pentagon is a waste of time,\" for example.\nAll the physical evidence proves that flight 77 did not hit the Pentagon. Disinfo agents such as John Judge and Jim Hoffman never discuss this evidence. The \"yes-planers\" only point to eyewitness testimony, which does not hold as much weight as physical evidence, which any experienced investigator will tell you. Most of the eye-wittnesses are not credible, being that they are mostly military and msm employees. How did 10 USA Today employees wind up in the right place at the right time to \"witness\" flight 77 hitting the Pentagon? What are the odds of that being possible?\nhttp://www.davesweb.cnchost.com/nwsltr68e.html to take a look at the \"witnesses\" to see just who they really are.\nSerious Disinfo Here!!!!\nI am part of a 9/11 Truth group on the west coast. I know many people who have left the San Fransisco 9/11 group because people have told me the group is guided by Jim Hoffmans theory that Flight 77 hit the pentagon, and apparently the group does not tolerate anyone who thinks otherwise. 
People got disgusted with this practice, and many left to either form a new group or do individual research.\nIn regards to the group I am involved in, I have had a couple people attack me specifically on the Flight 77 issue, but not on any other issue. Folks, I'm sorry but there is some serious disinfo going on with the pentagon situation. Now we do not know what for sure what exactly hit the pentagon, but it should not be hard to figure out that there is plenty more evidence to show that F77 did not hit the pentagaon than there is evidence to show that it did. That's a no-brainer.\n911Truth (not verified) on Fri, 11/03/2006 - 12:36pm.\npassengers at the Pentagon! The enormous forces + the explosion & fire, would've certainly destroyed any DNA that had been present!\nSo if the airplane really hit the Pentagon, why did they then feel the need to fabricate DNA results???\nJon Gold shows his true Aaaaaaagent colors...\nI always had my suspicions about Jon Gold, but his comments in this discussion thread really confirmed it for me. For one thing, the fact that Jon Gold uses known liar John Judge to reference his support of the official government fiction that flight 77 hit the Pentagon is kind of a dead giveaway. Besides John Judge's ridiculous fairy tale about the amazing and mysterious AA flight attendent/fellow conspiracy researcher/friend/rescue worker/expert plane parts identifier who \"proved\" that flight 77 hit the Pentagon, John Judge has also been busy inventing ridiculous fairy tales to support the official government fiction that the plane crashes and ensuing fires is what brought down the 3 WTC towers...\nYES, IN CASE ANYBODY MISSED IT, JOHN JUDGE REJECTS THE CONTROLLED DEMOLITION THEORY OF THE 3 WTC TOWERS - EVEN BUILDING 7! 
Why on earth would Jon Gold risk his reputation by siding with such a professional liar?\nSecond of all, Jon Gold uses typical PSYOP type arguments - attacking the messenger rather than talking about the evidence that the messenger is presenting. His one line comment above is the typical example. When a PSYOP agent has no valid arguments against the evidence that contradicts an official government lie (such as regarding the Pentagon attack), they resort to BADJACKETING - which is to accuse the messenger of being an agent in front of other people in the movement. BADJACKETING has long been the favorite tactic of COINTELPRO for the purposes of attempting to discredit someone in the movement in the eyes of other people in the movement.\nSo when Jon Gold responds to someone's argument by saying only, \"you're an agent\" without offering any valid reasons, and without offeriing any counter-evidence, he is just showing his true colors. It gets tiresome when he does this over and over again.\nYou're saying John gets paid to lie? Ever see his car?\nJon Gold on Fri, 11/03/2006 - 10:33pm.\nWhat does his car prove?\nSo, Jon, how could you possibly support someone like John Judge when he even lies about he collapse of the WTC towers? You really need to address this. Talking about John's car is just smoke and mirrors to distract us away from the uncomfortable fact that by referencing and supporting a known liar, your credibility comes into question.\nThe only thing I have to address are envelopes. He lies, or he has a difference of opinion? 
Or are you forgetting that according to the Scripps Howard poll, only \"16 percent of Americans speculate that secretly planted explosives, not burning passenger jets, were the real reason the massive twin towers of the World Trade Center collapsed?\"\nHe is in the remaining 84% that don't believe the buildings were brought down by Controlled Demolition, however, I have spoken to him on several ocassions, and he most certainly believes elements of our Government were complicit in the attacks.\nI have a good relationship with Professor Jones, and Dr. Griffin. I have posted all of their information. I have promoted every single movie that has come out with regard to Controlled Demolition.\nConsidering all that I've sacrificed, and all that I've done for this movement, my \"credibility\" should only be doubted by those who seek to destroy it.\nThe fact that I even have people trying to destroy it tells me that I must be doing something right.\nJon Gold on Sat, 11/04/2006 - 12:02am.\nJohn Judge does not simply \"believe\" with the 84% of the population who haven't looked into the evidence of controlled demolition. Rather, John presents himself as a researcher on 9/11. He HAS looked into the evidence, but he does not address it. Instead, he manufactures absurd and quite elaborate fabrications to uphold the official gov't fiction of how the buildings collapsed. For example, he said that building 7 was structurally damaged by 1.5 richter quakes that occured when the planes hit the twin towers. Obviously, anybody with, like, an IQ higher than 80 would not believe such nonsense and we know that John's IQ is higher than 80. He tells lie after lie - such as claiming that the bottom 10 stories of building 7 was somehow cut out 1/3 into the building - a blatent lie. If that were true, several central columns would have been cut out, which we know did not happen. John fabricates this without offering a shred of evidence. 
Besides, he knows damn well that even if that were true, building 7 would not have fallen symmetrically within its own footprint in less than 7 seconds. This list of absurd lies that John fabricates to support the official collapse theory is endless and there is no way that an experienced researcher like John could possibly believe such nonsense that he spews. He is very clever. Again, it is not about disagreement, John goes out of his way to lie and fabricate elaborate fictions to add to the official story, then he goes out of his way to disparage good 9/11 researchers who have proven that controlled demolition was used. That is the obvious behavior of PSYOPS agents.\nAnd don't forget Judge's ridiculous fable about his friend/fellow JFK conspiracy researcher/AA flight attendent/rescue worker/high-clearence Pentagon volunteer/expert plane parts identifier story that becomes more absurd and outrageous with every new version he tales. Yet again, John Judge does not address the physical evidence but can only see fit to fabricate elaborate and clever nonsense to support the official government story, like a true PSYOPS agent.\n\"I have spoken to him on several ocassions, and he most certainly believes elements of our Government were complicit in the attacks.\"\nWell obviously, he has to say that! That's how it works. How else can someone infiltrate the 9/11 movement in order to poisen and attempt to confuse and divide the movement? By saying that it was NOT an inside job? It's a no brainer that any disinfo agent has to at least pretend to be a 9/11 truther.\n\"I have a good relationship with Professor Jones, and Dr. Griffin\"\nThen why to you attack and ridicule them and everybody else who does not accept the Boeing at the Pentagon official fiction? 
The overwhelming consensus among the credible 9/11 researchers, including the Schollars for 9/11 Truth, Barrie Zwicker, Webster Tarpley, SPINE (Scientific Panel Investigating Nine Eleven), Jim Marrs, etc., is that the physical evidence scientifically disproves that flight 77 could have hit the Pentagon, and they have published their research to back it up. Who does that leave? Lets's see...only Jim Hoffman. And Jim Hoffman has not published anything to support his position that flight 77 hit the Pentagon, he only engages in PSYOP-type arguments basically ridiculing and attacking everybody else. That is why Hoffman hasn't been invited to any recent 9/11 conferences. People are tired of his divisive and destructive behavior towards other good 9/11 researchers.\n\n\nSorry, but if I have to choose between Dr. Griffin, Professor Jones, Jim Fetzer, Kevin Barret, most of Schollars of 9/11 Truth, SPINE, Barrie Zwicker, Webster Tarpley, Jim Marrs, etc., on one side, and a tiny clique consisting of Jon Gold, Jim Hoffman, John Judge on the other, common sense forces me to go with the former. If you act like a PSYOP agent, then you have to expect that people will see you as one.\nKeenan on Sat, 11/04/2006 - 1:33am.\nWhen I do, you'll be the first one I tell.\nIncidentally, are you sure you're not thinking of Nico Haupt?\nTell me why the stats suggest that my posts are the most popular on this site. \nNice try, but everybody can see that you still refuse to address my points, and those of all the good 9/11 researchers that you attack with your Pentagon disinfo nonsense. Playing popularity contest is just more smoke and mirrors. 
All you can do is respond in typical PSYOPS fashion, yet again...still waiting for you to address my points...\nYou are simply proving over and over again that you do not have any valid arguments.\nI have never...\n\"Attacked\" any of the individuals you cited, and I am friends with several of them.\nAs far as my \"Pentagon disinfo nonsense\", you mean this, or this?\nIt's interesting that your first posts on this site are to attack me.\nDid Nico send you?\nI'm off to take part in some 9/11 Truth activism.\nWhen's the last time you did the same?\nYou have consistently attacked anybody on this forum who does not accept the official government fiction of flight 77 hitting the Pentagon. You typically use arguments such as, \"the no Boeing at the Pentagon theorists are disinfo agents\", or \"arguing that flight 77 didn't hit the Pentagon is a waste of time\". By implication, you are therefore attacking all the good 9/11 researchers who do not accept the official myth of flight 77 hitting the Pentagon, including your friends Professor Jones, David Griffin, Kevin Barret, James Fetzer and most of the Schollars for 9/11 Truth, Webster Tarpey, Barrie Zwicker, Jim Marrs, Dylan Avery, Jasen Bermes and the folks at Louder than Words, SPINE, Dave Von Kleist, Eric Hufschmid, etc., and all the folks on this forum who also question the official myth of the Pentagon attack.\n\"I'm off to take part in some 9/11 Truth activism.\nWhen's the last time you did the same?\"\nYet again, more smoke and mirros to deflect attention away from the issue you still refuse to address, which is why are you trying to destroy so many good researchers in the 9/11 movement with your Pentagon disinfo crap?\nAt this point, I realize that you are not going to address the real issue because you don't have any valid arguments to dispute what all the above-named researchers have put on the table, all you can do is attack. Therefore, I'm not going to waste any more time with you. 
You have been exposed for all to see what your true purposes are.\nKeenan on Sat, 11/04/2006 - 12:56pm.\nThat giving advice to individuals, based on my experience, to focus on different aspects of the Pentagon, is \"attacking\" them? As far as calling people \"disinfo\", there ARE \"disinformationists\" out there. Some have frequented this site. I know for a fact that I'm not one of them.\nIn regards to John Judge, I created this thread to try and find out why people don't like him. I have always found him to be a tremendous resource. He is a walking encyclopedia of information, and anyone who's been around a long time knows that. I thought it would be interesting to get his perspective on different issues.\nJust as I would like to hear what Michael Parenti has to say. Just as I would like to hear what Gore Vidal has to say. Just as I would like to hear what Ray McGovern has to say.\nJust because you have a difference of opinion about a certain aspect of 9/11, doesn't mean you have to be bitter enemies. There are individuals out there who try to make it this way, but that's not how it's supposed to be. To me, as long as you make an honest effort at getting people involved, without hurting the movement, then you're fine by me.\nFeel free to ask Prof. Jones, David Griffin, Kevin Barrett, Dylan Avery, Jason Bermas if they think I've ever \"attacked\" them. Jim Fetzer and I have had a few disagreements. Barrie Zwicker and I have only corresponded a few times. I don't know Jim Marrs, the people at SPINE, or Dave Von Kleist. Eric Hufschmid and I used to be friends. We aren't any more.\nTell me... Have I ever been an individual with a history of causing disruption within this movement? An individual who starts email campaigns directed against certain individuals in this movement? An individual who writes denouncing articles against certain organizations within this movement? 
An individual who writes denouncing articles about certain events that take place within this movement? An individual who keeps tabs on what certain individuals within this movement are saying so that he can use their statements against them in the future? An individual who makes sure to act like a lunatic when the mainstream media decides to give this movement some attention? An individual who takes it upon himself to \"investigate\" certain individuals within this movement in the hopes of discrediting them?\nThe answer to that long list of questions is no.\nSome people have. Do you condone that kind of behavior?\nAlso, tell me why you think it's ok to promote a missile or an A-3 Sky Warrior hitting the Pentagon on National Television where all a pundit has to do to make you look like a fool is ask, \"Then where are the passengers?\"\nI don't like the fact that they have the ability to do that. Therefore, I try to ask that different aspects are promoted.\nJon Gold on Sat, 11/04/2006 - 10:57pm.\nvirtually every single eyewitness who claimed to see Flight 77 hit the Pentagon either worked for USA Today(they have been connected for years to the agency. but dont ask me, just google USA Today-CIA connections) or the government. im just saying............\nWho told me their grandparents live in a building within view of the Pentagon, and that they saw an explosion but no plane. Granted, they are old folk, but there it is. Add my eyewitnesses to the pile of contradictory ones, fwiw!\nthis is the problem with the 9/11 truth movement, there are so many diverse theories, i'm not nocking anyone it's just when they release a video of a plane clearly hitting the pentagon it could have a profound effect on the truth movvment.\nAnders (not verified) on Thu, 11/02/2006 - 6:42pm.\ndoesn't make sense (as i type this post) to release it after the election. makes you wonder what's up.\nwhat is the reason this couldn't just be released with the Citgo video? 
granted I don't know all the red tape procedural stuff that comes with something like this, but they're both security videos from private businesses, why is this one taking longer to reach us? i'm not suggesting something untoward, but it just really doesn't make much sense.\nto get it JUST RIGHT.\nilluminating9_11 (not verified) on Thu, 11/02/2006 - 8:00pm.\nI think they had time and this is more of an election thing. watch them pretend that whatever they release \"puts the conspiracy theories to rest.\"\nyep, the truth movement is \"in its last throes.\" we can only hope, given what that phrase seems to mean in Iraq!!\nwhatever it shows it won't reveal any objective truth on the matter.\nThe FBI or whoever has had it all this time and can make it show whatever. Most likely it won't have been manipulated, it will simply show an ambiguous object hitting the pentagon, or only the explosion and smoke will be visible. If that plane does show up on the video it will consist of perhaps 8 pixels to argue about what it was or wasn't, or what its trajectory is or is not. The fact that the pentagon issue cannot be precisely determined (like the rest of 9/11) only indicates it's most likely total BS. I agree with Jim Fetzer and the whole mindset that if anything on that day happened how the OTC claims it should be fairly obvious. Every day that goes by leaving these matters which should be scientifically verifiable to a matter of faith, only further cements my knowledge that this whole thing is a lie. I mean come on people, at this point, how much more obvious can it get? The sheer amount of time these questions and contradictions have hung in the air without being resolved is proof in itself. The air is thick with stale lies and it's really starting to stink to where more and more people are noticing.\nUntil some kind of real investigation takes place nothing can really be verified. 
And these things (false flag operations) are set up in such a way as to leave evidence appearing one way when the opposite is true. I sure hope we're not still talking about evidence about what did and didn't happen 5 more years from now. And who knows the kind of myth and folklore and further divide 9/11 will come to represent 10 and 15 more years from now. Will it pass to become another JFK? An event rife with contradiction and speculation, where most of the public knows it's BS, and yet the government will always stick to its own history, a history that it wrote?\nResearcher488 (not verified) on Thu, 11/02/2006 - 6:59pm.\n\"please note that we still have a second FOIA request active for the other 84 flight 77 recordings. we WILL get those as well. they should include the citgo tape and the doubletree hotel tape (there's no sheraton hotel tape, BTW)\"\nhave nothing to say about this upcoming release?\nwhen the gas station tape came out...\nWhere did the number 84 come from (in terms of pentagon attack videos out there)?\nscreaminaj (not verified) on Thu, 11/02/2006 - 8:01pm.\n\"September 9, 2005: Special Agent Jacqueline Maguire of the FBI's Counterterrorism Division files a DECLARATION describing her search for records responsive to Bingham's FOIA request. Maguire admits to determining that 85 videotapes in the FBI's possession are \"potentially responsive\" to the request, that she personally viewed 29 of the tapes, and that she located only one videotape that showed the impact of Flight 77 into the Pentagon. 
Maguire also refers to \"one videotape taken from a closed circuit television at a Doubletree Hotel in Arlington Virginia,\" but states that it did not show the impact of Flight 77.\nSeptember 26, 2005: Hodes files a request seeking \"copies of 85 videotapes in the possession of the FBI described in the declaration of Special Agent Jacqueline Maguire dated September 7, 2005.\"\nstallion4 on Thu, 11/02/2006 - 9:30pm.\nif we do see a \"Fly By\" in the upcoming release, i'll believe even more strongly in the \"Fly Over\" theory. credible eyewitnesses saw the plane fly towards the Pentagon fromthe left of the Citgo gas station, the FDR recently released by the NTSB shows almost the same thing, the evacuation of Reagan Airport, the other white plane over DC. its a perfect deception really.\nboast on Thu, 11/02/2006 - 7:32pm.\nhow low did the 757 fly?\nIf they have a 757 video, then they also have to show us how it made this ground-level hole. Someone made a 3d reconstruction of how it was possible for the 757 to disappear in the hole. But this model only went to show how unlikely it all is - almost like the \"magic bullet theory\" of JFK, nothing is really \"impossible\".\nSadly, i assume the video will once again be like the gas station video, something that shows almost nothing. And even if a plane is seen on it, remember the official version had a military C130 flying behind the 757 and the pilot seeing it crash (the same magic C130 that appeared in Shanksville later).\nGreg (not verified) on Thu, 11/02/2006 - 8:43pm.\nhappen to push the envelope a little bit??? It's getting sort of annoying.\nAre you talking about the up/down arrows?\nOr the stars that go with the blog posts?\nI find the up/down arrows helpful in that I sometimes just want to note that I agree, or that the post was a good one without adding a post that says next to nothing -- like \"Good post!\" Also, there are certainly times when I want to express disagreement without contributing to a thread. 
I don't think I have an ax to grind, though some may disagree.\nGiven that only registered users can vote on posts, what would make you think a really dumb one would get a lot? Ten votes in either direction happens pretty infrequently -- which means most people don't vote, most of the time.\ncasseia on Fri, 11/03/2006 - 1:26am.\nand the site owners should be able to detect abuse... normally I think they tell us a lot about how strongloy people feel how about which issues.\nhaha, democracy in action man. the people have spoken by giving casseia 5 and you -5. dont take it personally, ive gotten my share of negative posts before myself. i like the point system because it saves me from going on rants or getting in arguments. i can just give somebody a negative point and move on if i dont feel like getting into it. my 2 cents.\nsure, i think its consistent. you got a -7 on your post because obviously more people here like the points system than dont like it. thats why casseias post got almost the mirror opposite in positive points. same thing with the Jon Gold-Keenan exchange. i would guess that more people on this site(at least people who voted) think that its more likely Flight 77 didnt hit the Pentagon than did. and Jon may be popular with many people here, but there are some who dont appreciate how he and others like John A. compare the no Flight 77 theory with CGI and holograms and make little snide remarks about how speculating about Flight 77 \"hurts the movement\" and causes the media to slander and ridicule as, as if they wouldnt do that anyway. both Jons do great work, but they likely lose some points by ridiculing CD and Pentagon theories regularly. just a guess. and for the record Andrew, i typically dont give your posts thumbs down, in fact, the one you have negative 7 on right now i only clicked down becuase im one of the ones who likes the point system. 
i typically agree with what you say, and agree that you have the right to talk about CGI or anything else. recently when there was discussion about banning no-planers i voiced my opinion that we should not, despite being very much against that theory. so basically, i think the point system is useful.\nwhich quotes were you referring too? I only give minus to anyone who agrees with the official story or says it wasn't an inside job because I'm 100% confirmed that it was. And I was a supporter of Bush and the war for a few years!\ncard51short on Thu, 11/02/2006 - 10:35pm.\nThe article stresses the point that the employees watched them multiple times - who were the employees and what do they say they saw?\nTrollosaurus (not verified) on Thu, 11/02/2006 - 8:00pm.\neffects in the manipulated video are real, and that they did see the same thing themselves.\nhave any credibility whatsoever?\nAny CG enthusiasts?\nSomeone needs to bring out a couple of CG examples of similar things to show how easy it is to fake these things, before the Officials do the same thing.\nPreempt this stuff, guys.\nPreempt this stuff, guys. Just say that clear video is coming out in the next week of the Pnetagon hit. If it is anything less, alternative theorists look good.\nTed Lover. (not verified) on Thu, 11/02/2006 - 11:47pm.\nAccording to Special Agent Jacqueline Maguire of the FBI's Counterterrorism Division, the Doubletree Hotel videotape \"did not show the impact of Flight 77.\"\nIf what they release on the 9th (Doubletree Hotel video) shows flight 77 impacting the Pentagon, we know it's lie (according to what the FBI has already admitted).\nstallion4 on Fri, 11/03/2006 - 12:02am.\ngreat post stallion! Either way, they are liars!\nBut I already know that it won't be anything significant.\ncard51short on Fri, 11/03/2006 - 12:40am.\nI hope ithe video has a good view of the object that hit the pentagon. Hopefully this will show what it was and put an end to this chapter in the 9/11 book. 
The other recently released footage didnt show jack. I still think it was a jet airliner but until we see some good footage its technically an unidentified flying object. My vote goes to commercial airplane.\nAndy White on Fri, 11/03/2006 - 2:05am.\nOf course, just for the sake of posibilities, what if the new tapes show something other than flight 77.\nTruth for a Change on Fri, 11/03/2006 - 4:00am.\nI'm pretty sure that's the only option completely outside the realm of possibility at this point. If there was anything at all on that tape that could in any way damage the official conspiracy theory, the Feds just would have lost the tape, burnt it by accident or shredded it and thrown it away (in several trash cans in several locations, preferably). If you haven't noticed, evidence has been blatantly destroyed throughout this painful process, in plain sight of anyone who cares to look.\nI'm with the guy who said we'll at most see a few distorted pixels of somethingness and the explosion. And the Hannitys and O'reillys of this world will feel totally vindicated. Og, it must feel good being them, or suffering from the same illness... Ignorance must truly be bliss.\nGisli (not verified) on Fri, 11/03/2006 - 4:24am.\nWhy be afraid of Pentagon NPT?\nEven if they released a sophisticated piece of CG imagery, there'd still be the fact that the entry hole is way too small and the \"exit hole\" is way too big and way too far in (10ft of reinforced concrete, I mean WTF). Matter of fact is that the physical evidence does not match the official narrative. And thus, there's nothing wrong with calling the Pentagon issue to everyone's attention...it's a truly big stinking fish, and no amount of special fx high gloss envelope will ever change that. Unless we'll all be lobotomized, of course.\nSo in short, don't fear the Pentagon NPT, the facts speak for themselves. 
If someone had an inkling that the PTB might release footage of a hellish inferno engulfing WTC7's south face, would you stop bringing its controlled demolition up?\nbruce away from home (not verified) on Fri, 11/03/2006 - 5:36am.\ndoes nobody else even consider that this will be a really good film....if i was them i woulda kidnapped the best techy in hollywood and forced him to sit in chair until he produces a fake video capable of convincing the unconvinced! dont underestimate their intelligence and deviousness.\nThe chief suspect holds all the \"evidence\" and gets to release bits and pieces of said \"evidence\" a politically opportune times, if at all. The Ben Laden tapes strongly appear to have been faked and the cell phone calls from the planes also appear to be faked. Why would any released photos be accepted as unaltered true and correct photos?\nEven when and if we see photo of a \"plane\" hitting the Pentagon, we still do not know what it was that \"hit\" the Pentagon, much less if the photo is faked.\nThe chief suspect(s) are murderous treasonous despots and evil thugs.\nThis story is much ado about nothing.\nOur opinions and positions on so many issues are so similar - actually identical - that people are going to begin to wonder why they have never seen us in the same room together.\nYour analysis of the Pentagon brew-ha-ha is quite lucid.\nJohn Albanese on Fri, 11/03/2006 - 9:59am.\nWe must share the same brain.\nAbby Scott thought I was you.\nKarl Rove promised something spectacular for October, but it makes better sense to wait until Monday...however the computer technology available today can make anything look any way you want it to. My guess is, it'll be a doctored fake. Good, but faked.\nThat still won't explain the perfectly round cirlcle of impact though.\nMaybe it wil show...\n...Hani Hanjour waving to the camera as the plane passes by....\nMy guess is that the explosion will obscure the impact.... 
there will not be full penetration prior to an explosion.\nI think that we were given a hint though.... the FOIA request mentions the videos showing an impact with the Pentagon..... the person reviewing the videos said that there were only so many videos showing that.... perhaps we simply need to request the videos showing a plane flying over the pentagon.\nJJames (not verified) on Fri, 11/03/2006 - 10:47am.\nIt will either be something totally unhelpful like the Citgo tape, just so people can say they're not hiding anything (except all the PENTAGON surveillance camera tapes) or a decent fake to \"put it all to rest\" except of course for the LIHOPPERS who will harp on about warnings and missed chances to kill bin Laden, yatta yatta.\nOf course thanks to Nico we'll have to choose a term other than \"video fakery\" to describe what they did. Video forgery might be a good alternative. It's almost like the NPT folks were setting us up for something like this, no?\nIn any case, just goes to show how important the demolitions are compared to the Pentagon. Larry Silverstein didn't sign the lease on the Pentagon 6 weeks before 9/11, after all!\ndz on Fri, 11/03/2006 - 11:48am.\nLucky for you I have a sense of humor. It's true though... John and I say a lot of the same things. Let's say for instance that there are 9/11 Truthers who tend to focus on things of a scientific nature. As inconceivable as that is... :P John and I lean more towards the other side of the spectrum. People's actions, contradicting statements, historical references, and stuff like that. I also think that he and I probably had a lot of the same influences.\nProbably is lucky for me....\n.....as I recall, someone I know who has met you said you were a pretty big guy...\nSeriously, I'm a big fan of what you and John do here....just having a wee bit of fun....\nI agree that the science can be over-emphasized....as Donald Sutherland's \"Mr. 
X\" in the movie JFK says (to paraphrase), what's really important is \"why\" (why 9/11)?.......\naltruist on Fri, 11/03/2006 - 7:21pm.\nThe \"debunkers\" like to make fun of my 'weight problem.\" I can't \"weight\" to eat.\nThe actions of this Government after the fact point to why.\nPentagon Video in the Nick of Time for Midterms?\nIf I was a betting man, I’d bet the farm on it.\nHowever, simply releasing a five plus year old video with a predictable corporate media broadside taking “conspiracy nuts” to task, thus questioning their patriotism and even sanity, and suggesting Republicans were right all along about those cave-dwelling Muslims with their boxcutters, may not be enough to flip the election and keep the House and Senate in neocon hands.\nAs Killtown points out, “four days before the [2004] elections, the first video of bin Laden publicly taking credit for 9/11 which boosted Bush’s poll ratings in Ohio which lead to him being re-elected.” Indeed, after the video was aired, according to the Telegraph, Bush “opened a six-point lead over John Kerry…. If the trend is confirmed by other polls, Mr Bush may have his greatest enemy to thank for helping him secure another four years in the White House after the appearance of the video sparked a sharp final round of argument over which candidate can best defeat terrorism.” \nIt would seem, as well, the Osama in the “October Surprise” video had plastic surgery, specifically to make sure he did not resemble the real Osama, who died nearly three years before the fake Osama went before the camera. Compare the Osamas on this page and decide for yourself.\nA blurry security camera video shot from atop the Doubletree Hotel will be a shade better than the ludicrous video frames released in May, a pathetic effort lauded by the corporate media as a bold finale putting to rest once and all conspiracy theories about what hit the Pentagon. 
No doubt, as well, this latest effort, promised by the FBI and the suspicious Judicial Watch to be released no later than November 9th, will be heralded as conclusive evidence the aerobatic Hani Hanjour, who had trouble controlling and landing the Cessna 172, executed a flawless, 500 miles per hour maneuver of a Boeing 757 into the Pentagon.\nI think the elites WANT democrats to win this election.\nIt will make people think they have \"demanded change\", when democrats instead give us the status quo. The media will no doubt be falling all over telling the world how the \"people have spoken\" as the dems \"take back\" the house and/or senate.\nDemocrats winning will put lots of people back to sleep, thinking that they've \"done all they can\" and we'll just have to \"wait for congress to do something\".\nIn the next few weeks the phony left/right paradigm will be exploited in the media like you've never seen before. Then we will get nothing, not a peep, out of the democrats for the next two years and beyond!\nkevin (not verified) on Fri, 11/03/2006 - 11:27am.\nright on, that is exactly what is going on... they're switching the folks at the top on us is all. MAYBE they'll try to then use a little LIHOP to defuse the truth movement. SEE? democrats are INVESTIGATING 9/11 - the loonies will NEVER be satisfied though. with a few LIHOPPERS from the movement cheering on the Dems, who knows they may have some success... but I doubt it. it may help them contain the truth for a little while longer until the 2008 october surprise, right befor McCain and... someone beat Hillary and Obama... or the other way around depending how things go. they think they cover all the bases--our job is to keep making new bases!!\nThere is no 2-party system in this country any longer!\nIt's a 1-party system, just the way the elites want it! No checks & balances!\nCould you ever image two leading Democrats, Kerry & Hillary, acting like such jellyfish in dire times like these? 
They should be ripping the Bushies to shreds, but there's hardly a peep out of them!\nGREEN PARTY candidates must start winning elections.\nDemocrats need to abandon this worthless spineless party and start supporting REAL candidates that stand for change.\nNunyabiz on Sat, 11/04/2006 - 1:13pm.\nThere are only two things related to the Pentagon on 9/11 that have potential for the 9/11 Truth Movement, and videos being released by the FBI ain't one of them. There is absolutely no way the government will ever release a video that shows a missile impacting the Pentagon on 9/11, so all speculation about these FOIA requests is pointless.\nTwo things are relevant: (1) What happened to the $2.6 trillion? (2) What were Cheney's \"orders\" as he watched a plane approach the Pentagon?\nThe No-Boeing theory will remain unproved.\nUm, how can the No-Being theory remain unproved when it was never unproved? The scientific evidence proves the No-Boing theory and you cannot dispute it, regardless of the \"eye-witness\" statements, most of whom are not credible, and much of it is contradictory anyway. The science of the physical evidence outweighs any eyewitness testimony - Just ask any experienced investigator. On top of that, you have the official lies and cover-up behavior (hiding the videos for example) - itself strong evidence that the government's version of what happened at the Pentagon is not true.\nThere really is no reasonable argument to support the government's fable that flight 77 hit the Pentagon theory. 
When you have both A: Physical evidence contradicting the official story and, B: Massive cover-up behavior, it's pretty much an open and shut case for any experienced investigator to conclude that the No-Boeing theory is the only reasonable one.\nAnyone who argues that the \"No-Boing\" theory is unscientific either has not done their homework, or has been duped by the small but loud clique of \"yes-planers\" such as John Judge, Jim Hoffman, or Mark Robinowitz, who's arguments pretty much consist of ridiculing those who do not accept the official Pentagon fable and accusing them of being disinfo agents.\nI happen to agree with Jim Hoffman. The No Boeing theory is full of holes, if you will, and represents a huge \"honey pot\" for skeptics.\nYou didn't provide any counter arguments. Just saying that the No Boeing theory is full of holes without providing any specifics is an argument which is full of holes itself. You are just behaving like a dupe of disinfo agents such as Jim Hoffman. The physical evidence scientifically proves that a Boeing dit NOT hit the Pentagon and you can not dispute it scientifically, just as you can not dispute the scientific evidence of controlled demolition at the wtc. \nKeenan on Sat, 11/04/2006 - 3:02pm.\nNo counter arguments? I linked to Jom Hoffman's essay chock full of counter arguments. The fact that you call Hoffman a \"disinfo agent\" does not make him so. He provides several photos of airplane parts on the Pentagon lawn (unlike Loose Change, which pretends there is only one such photo), and photos of parts inside the Pentagon. Reading fuzzy photos of the Pentagon is not \"scientific evidence\".\nYou mention controlled demolition of WTC, but I never said anything about the WTC. I do believe explosives were used at the WTC. I don't believe a missile hit the Pentagon.\nIt's weak arguments like the No Boeing stuff that continue to undermine this movement. 
Everytime the Loose Change people are on TV, the first question they are asked has to do with their theories concerning the Pentagon. Why do you think this is so? Because they make dubious claims about the Pentagon, and because No Boeing is a honey pot that will eventually be used to discredit everything 911 truth has done.\nThe better questions to ask about the pentagon are, how did anything hit the pentagon? and what did cheney order as a plane approached the pentagon? these questions lead to real evidence, not fuzzy photos.\nIn five years since 911 the gov't has had plenty of time to create whatever sort of fake footage they would like.\nWhatever the video contains will have no bearing on what actually happened to the Pentagon.\nA2planet (not verified) on Fri, 11/03/2006 - 7:07pm.\nthat it is \"possible\" that a Boeing 757 hit the Pentagon, virtually all of what little evidence there is shows that one DID NOT.\nI have to pretty much agree with \"most\" of the 9/11 researchers, at this point I would have to say the odds of \"something\" other than Flt 77 hitting the Pentagon are at least 90%.\nThere is a slight chance that a miracle was preformed and Hanjour top gunned his way to hit the Pentagon perfectly or that it was remote controlled and some how managed to fly 20' off the ground at 500mph and \"flow\" into a 16' hole while the Government kept a video tape clearly showing it -hidden for 5 years, but dont sound to feasible to me.\n"} 
+{"added":"2023-04-07T15:24:40.687431+00:00","created":"2020-03-29T07:55:25Z","id":"http://9crimes.org/charlesxavier/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T07:55:25Z","digest":"sha1:HVHHQZMGR6WXNDD5I6M5DMLJU4VUXFTY","language":"en","language_score":0.95,"length":526,"line_ids":[4,6],"nlines":2,"original_length":1752,"original_nlines":14,"perplexity":168.2,"provenance":"cc_en_head-0112.json.gz:8","source_domain":"9crimes.org","title":"X » charles xavier aka professor x fanlisting","url":"http://9crimes.org/charlesxavier/"},"source":"common-crawl","text":"Welcome to X, the Charles Xavier fanlisting, currently listed under the Animation, Comics and Characters: Book/Movie \u0010\u0010categories at The Fanlistings Network. Charles Xavier is a fictional character from the X-Men series. If you're a fan of Charles Xavier, please consider joining the ever growing list of fans from around the world!\nThis fanlisting was opened on November 10, 2014. There are currently 4 fans listed, with 0 waiting to be added. It was last updated on March 9th, 2020. 
Welcome to our newest member(s), LINDSAY!"} +{"added":"2023-04-07T15:24:40.735716+00:00","created":"2020-03-29T08:43:41Z","id":"http://9pmstudios.com/9pm-studios-lends-support-to-city-dogs-rescue-city-kitties/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T08:43:41Z","digest":"sha1:6G7QCPPA4CIMXVSDSZQI2SJWNW5V7VEK","language":"en","language_score":0.94,"length":1562,"line_ids":[9,10,11,12,13,14,15],"nlines":7,"original_length":2706,"original_nlines":42,"perplexity":197.8,"provenance":"cc_en_head-0112.json.gz:9","source_domain":"9pmstudios.com","title":"9pm Studios lends support to City Dogs Rescue & City Kitties • 9pm Studios","url":"http://9pmstudios.com/9pm-studios-lends-support-to-city-dogs-rescue-city-kitties/"},"source":"common-crawl","text":"For the fourth consecutive year, 9pm Studios is proud to have the opportunity to support the City Dogs Rescue & City Kitties Annual Online Auction.\nThe City Dogs Rescue & City Kitties online auction showcases artists, restaurants, theaters, and small businesses from across the DC metropolitan area. From wine tastings in Virginia’s rolling hills to sessions with expert pet photographers, you’re sure to find new products and experiences you love, while helping to save and care for dogs and cats from overcrowded and high-kill shelters.\nAmong the hundreds of items up for auction is $500 worth of 9pm Studios consulting services, including web design, digital marketing, and graphic design. Grab one of our $250 vouchers up by bidding here and here.\nBidding is open until May 21 at 11 pm.\nCity Dogs Rescue & City Kitties is a 501(c)(3) non-profit organization based in Washington, D.C., that adopts dogs and cats from overcrowded and high-kill shelters. Since its establishment in 2011, City Dogs Rescue & City Kitties has saved more than 4,000 dogs and cats! 
We have gained a strong community of volunteers along the way.\nIn Washington City Paper’s Readers’ Poll for the last four years (2015-2018), City Dogs Rescue & City Kitties was voted Best Non-Profit. We also were voted “Best Place to Volunteer” and “Best Charity Event” in 2018. The Catalogue for Philanthropy recognizes City Dogs Rescue & City Kitties as one of the top charities in greater Washington based on its impact, financial viability, and transparency."} +{"added":"2023-04-07T15:24:41.062343+00:00","created":"2020-03-29T08:32:21Z","id":"http://activecities.com/blog/history-of-ice-sports/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T08:32:21Z","digest":"sha1:QOJMDT63IPHHXYH7S63AMFBFJQ2OOLPX","language":"en","language_score":0.97,"length":9148,"line_ids":[31,34,35,36,37,38,39,40,42,43,44,45,47,48,49,50,51,53,54,55,57,58,59,61,68],"nlines":25,"original_length":10546,"original_nlines":101,"perplexity":187.1,"provenance":"cc_en_head-0112.json.gz:10","source_domain":"activecities.com","title":"Everything Is Cooler On Ice | A History Of Ice Sports","url":"http://activecities.com/blog/history-of-ice-sports/"},"source":"common-crawl","text":"Ice and snow are a fact of winter in most parts of the world. In some places, they’re even more common. Over the millennia of human history, people have developed plenty of ways to have fun with icy conditions. Some of these activities have evolved into competitive events played all over the globe. Here are some of the ice sports with the longest histories.\nSleds and sledges have been used since ancient times, particularly in areas with heavy snow cover. Today, there are three main competitive sledding sports: bobsled, luge, and skeleton, which are all derived from the same situation.\nIn the Swiss spa town of St. Moritz, hotelier Caspar Badrutt frequently hosted English tourists. 
In the 1870’s, guests at Badrutt’s hotel started using delivery sleds to race down hills and roads in St. Moritz. Sledders frequently interfered or collided with pedestrians, which, coupled with the roads’ many twists and turns, required a means of steering the sleds. This led to two developments: First, Badrutt built an ice track designed for recreational sledding. Second, Badrutt’s guests invented the three modern competitive sleds.\nBobsleds (or bobsleighs) are steered with two metal rings attached to a pulley system that turns the front runners. At the high speeds of competitive bobsleigh, turns must be subtle to prevent crashing. Passengers sit. At one time, five or six people would be on a bobsled crew, but in the 1930’s, the crew size was reduced to two or four people. In a two-person team, a pilot steers and a brakeman pulls the brake lever at the end of the race. A four-person team also has a pilot and brakeman, but adds two pushers.\nLuges are manned by one or two people. Lugers lie on the sled feet-first and steer by applying pressure to the runners with their calves or shoulders. The first record of the worbobsled oldd’s use is from 1905, from a Swiss dialect of French in which “luge” referred to a “small coasting sled.” The gloves worn for luge racing have tiny spikes built into the fingertips, allowing for maximum speed gain at at the start.\nSkeleton sleds are named for their simple, bare appearance. There are no steering or braking mechanisms; riders steer only by shifting their body weight and usually brake with their boots. Top skeleton riders can reach speeds upwards of 75/mph.\nSometimes referred to as “chess on ice”, curling is played by two teams of four players each. The length of ice used for play is called a curling sheet, marked with a circular target area (or “house”) at each end. Players take turns sliding round stones of polished granite to the opposite house. Two sweepers use brooms on the ice to direct the stone’s path. 
When each team has thrown all eight of their stones, an end concludes, and points are counted according to the stones closest to the center of each house. Most games have eight or ten ends, this is why people go crazy in order to play now.\nCurling was invented in medieval Scotland, and today the World Curling Federation has its headquarters in Scotland. Curling existed at least as early as 1511, based on the year’s inscription on a curling stone found in a drained pond in Dunblane. The first reference in writing appears in a document from 1541, but the word “curling” was not used to describe it in print until a 1620 poem by Henry Adamson. The word refers to the motion of the stone across the ice.\nIn the early days of curling, stones were not uniform. Players used flat-bottomed river stones of varying size and shape, contributing to poor control after a throw. This is a contrast to modern curling, in which sweepers can direct the stones and do so deliberately. Other regions tried to solve the inconsistency with wood, irons, or tins filled with ice.\nCurling has been on the world stage since 1959, when the first world championship was held in Scotland. Only men were allowed to participate in the first Scotch Cup. Demonstration matches were held at the Olympic Games in 1924, 1932, 1988, and 1992, but curling was not an official Olympic sport until 1998. It remains so today, with separate tournaments for men and women.\nLike sleds, ice skates have been around for a long stretch of human history. The oldest evidence comes from a pair made of animal bone found at Lake Moss, Switzerland. These skates are dated to 3000 B.C. The earliest known reference in writing is from the 12th century, which also describes edgeless skates made of bone. The Dutch invented the first steel-edged ice skates in either the 13th or 14th century, and a height-to-width ratio for blades was established shortly afterward. 
The original Dutch designs have been only minimally altered in the centuries since their invention.\nThe first ice skating association was the Edinburgh Skating Club, formed in 1742. Contemporary writing suggests that the Club practiced an early form of figure skating, but figure skating didn’t become “official” until 1772. Robert Jones, an artillery lieutenant in the British military, wrote and published an instructional book on ice skating that went over basic forms. The publication of this book is seen as the point when skating split into figure skating and speed skating.\nCredit for the invention of modern figure skating goes to an American, Jackson Haines. In 1864, he won a skating championship by being the first skater to use dance and ballet in his movements, rather than following the British method of tracing patterns on the ice. He invented the sit spin skating move and a blade with a shorter length and curve that made turns easier. Americans and Britons still preferred the British mode for some time, but Haines popularized his forms in Europe.\nVienna was particularly impressed, and the Vienna School developed his techniques, eventually forming the International Skating Union in 1892, seven years after Haines’ death. As the world’s first international organization for ice skating, the Union codified rules for figure skating. It still exists today, though it has moved from its original home in the Netherlands to Lausanne, Switzerland.\nFigure skating first appeared at the Summer Olympics in 1908. It was the first winter sport to be held at the Olympic Games, 16 years prior to the first dedicated Winter Olympics.\nA variety of stick-and-ball games have existed for millennia, going back to before Christianity. In Europe, most of these games were not far removed from field hockey, such as Ireland’s hurling, Scotland’s shinty, and England’s “bandie ball” (now commonly called bandy). The Dutch and Norse played similar games on ice. 
But the first evidence of modern ice hockey comes from the 1700’s, when people in Britain would play “hockey on ice”. Rather than a ball, they would use a cork stopper from a barrel, called a “bung”. An engraving from 1797 shows someone ice skating on the River Thames with a stick and bung.\nWhen British soldiers and immigrants went to Canada and the United States, they brought their stick-and-ball games. They played “hockey on the ice” during cold weather. Informal games borrowed from Native American stick-and-ball games. In particular, a Mi’kmaq game similar to lacrosse lent physical aggression to informal games of hockey.\nThe first organized game of indoor hockey took place in Montreal on March 3, 1875. It was played between two teams of nine players. They used a circular, flat piece of wood rather than the more common bung or ball. The rules were derived from those used by England’s field hockey organization. The first hockey club was the McGill University Hockey Club, established in 1877, and team size went from nine to seven by 1880 (now hockey is played by six to a team, including the goalie). The first professional hockey players played for the Western Pennsylvania Hockey League by 1902.\nBecause both sports require ice skates, the history of speed skating is rather similar to that of figure skating. Skating clubs in Norway began holding organized races on ice skates in 1863, probably the first organized competitions. Tracks were not of standardized length until 1892, when the International Skating Union set the length at 400 meters. World championships used race distances of 500 meters, 1,500 meters, 5,000 meters, and 10,000 meters. Speed skating’s first Olympic feature was in 1924 at the first winter games.\nIn short track speed skating, four to six skaters start at the same time. They race in lanes around a rink with the same dimensions as an internationally-sized hockey rink. 
Long track speed skating is done in pairs on an oval ice track similar to an outdoor track course. The track has two lanes, and skaters switch lanes during their laps so they can cover the same distance.\nWhether for fun or glory, ice sports are a time-tested way to have fun in cold conditions. This list only accounts for some of them; if you’re at a loss, you can always fall back on the good ol’ snowball fight.\n"} +{"added":"2023-04-07T15:24:41.965337+00:00","created":"2020-03-29T08:46:36Z","id":"http://ajitucapoeira.com/history.php","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T08:46:36Z","digest":"sha1:WOLLOOSANGRKIPWK5TFZ4VOMYTJZFLIS","language":"en","language_score":0.98,"length":11274,"line_ids":[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48],"nlines":37,"original_length":11699,"original_nlines":54,"perplexity":297.5,"provenance":"cc_en_head-0112.json.gz:12","source_domain":"ajitucapoeira.com","title":" Ajitu Capoeira - History","url":"http://ajitucapoeira.com/history.php"},"source":"common-crawl","text":"Brazil's history, let alone Capoeira's, can be contradictory, peculiar and complex due to an influx of African, European and native cultures interacting in all aspects of life in Brazil. Thus creating a unique melting pot in which each culture tried to (and successfully did) preserve their identities in different regions, aspects and social levels throughout the country.\nWhat we do know is that although Capoeira has no written history until the 18th century, it definitely existed. This is most obviously portrayed by the art's oral history and tradition. 
Most, if not all, of the songs your teacher or Mestre know is from hearing the song from their teacher, friend and family etc, and so this goes hand in hand with other knowledge of the art.\nCapoeira has a huge oral tradition and this begins when the Portuguese first descended upon Africa to procure slaves through any means necessary. The Portuguese had already been to Brazil however. Seen as an excellent outpost for the Portuguese empire, it was close to Africa and India for convoys of trade. In 1500 they first set foot in Brazil and interacted and co-existed with the indigenous native tribes of Brazil, mainly the Tupi, and for 322 years Brazil was a Portuguese colony.\nThe Portuguese had massive plantations that they needed the natives to work on.\nAt first they purchased their slavery, exchanging all sorts of items but nothing of enough worth. Then they simply began forcing them to work with muskets and whips.\nEventually the Portuguese needed and wanted slaves from Africa, of which they already had hundreds of thousands in other parts of their empire for a century before they colonized Brazil. So it was a natural step for them to transport more captive Africans to Brazil. They came predominantly from Sudan, Guinea and Angola, though in those days borders were not so strict and there were slaves from Congo and Mozambique too. Tribes called Yoruba, Hausa and Bantu to name but a few. At first they enslaved only those whom they found got in their way, though it does not take long for power to corrupt and eventually slave trade was a business itself, human beings thought of as merchandise.\nInitially the cost of this was \"cheap\" and being greedy they over crowded the vessels which transported the slaves to Brazil. Which, back in the 16th century this was not an easy task, with terrible conditions for the slaves during a three month journey; they lost approximately 400,000 lives, bodies thrown away in to the deep Atlantic sea. 
With a high mortality rate of slaves in Brazil there were approximately 4,000,000 (four million!!!) slaves traded until 1850, when the Queiroz law abolished slavery. Though it continued illegally until 1888/89!\nThe slaves were always mixed, held in different plantations and senzalas (slave quarters) to familiar people (friends and family especially) and so making organized rebellions difficult. Nevertheless, as well as constant individual escapes, there were recorded revolts against their captivity and many times when they escaped they formed independent 'slave' villages in the jungles, forests and 'backlands', which were called Quilombos.\nThe slaves were not allowed any luxuries, especially the practice of fighting. There are many theories as to the actual birth of Capoeira, and there is no definite, finite birth or date where suddenly someone said this is Capoeira.\nIn 1890, the first Brazilian president Marechal Deodoro da Fonseca created the Código Penal, which contained an act that prohibited the practice of Capoeira nationwide, with severe punishment for those caught. Regardless of this it remained, practiced by the poorer population on public holidays, during work-free hours and similar occasions. Riots caused by police interference were common. The nick-names in Capoeira were predominant from the early days of the art but became more popular once Capoeira was officially illegal. This is something we keep today as a tradition.\nAnd so it was a bad time to be a Capoeirista.\nCapoeira even before this was often attributed to sneaky, violent criminals who were low- lives, sons of slaves etc... 
Much of the time Capoeira was played in like a death game, sometimes with razor blades between their toes when they fought or a hidden knife in their trousers, inviting a trend of silk scarves around the necks of Capoeiristas at rodas everywhere.\nAnd it wasn't just the razor blades, due to the quality of life and the association with Capoeira a lot of people were struggling just to survive day to day life, many carried knives and weapons which unfortunately came out in rodas too. Capoeira was never a popular art form (in fact it wasn't even known as one until the 30s/40s) and so when it became officially illegal many Capoeiristas turned to crime, of all sorts. Being self confident, with a fighting spirit and anger at the sudden 'freedom' many were given it was not a surprise that with nowhere to go and nothing to do they turned to crime.\nIt was also during this period that great Capoeiristas would rise up to fight off the persecution and repression of Capoeira. While the art had spread in this early period the police and government were cracking down so hard that it was all but extinguished in Rio and Recife. Recife had seen the Capoeira turn into a protection detail for the bands during Carnival and big events. A big band would march with a Capoeirista at the forefront and whenever another band was encountered there would be bloodshed. The Capoeira in Rio was simply more of a rogue/ criminal activity, in fact gangs had been formed that were so dangerous and powerful the government would use them to fend off rallies and unpopular group behaviour.\nBrazil was forcing Capoeira into a corner; one which it did not want to be in and for a while did not know how to get out of.\nCapoeira does not go away. No matter what the police did, who they killed, beat, tortured or corrupted it was always there, underneath the eyes of the authorities and people who didn't know what to look for. 
This period in time would turn out to be one of the most important periods in the already long saga of Capoeira.\nThere were legendary figures like Besouro, who had Corpo Fechado (closed body - a supposed magic of invulnerability) and Nascimento Grande who fought police when they had to and always came out on top.\nOnly 3 years prior to the start of the 1900s a momentous occasion happened for one young 8 year old name Vicente as he was invited by an African named Benedito to try Capoeira. Two years later Manuel Dos Reis Machado would be born into the world. It would be these figures that would go on to change the shape of the Capoeira world for ever...\nThese two legends (Mestre Pastinha & Mestre Bimba) were so integral to the twentieth century of Capoeira that even now all we do as Capoeiristas is in reverence to them, they are ancestors to us all and we pay homage and thanks for all that they did. They looked at the Capoeira and while one took it and repackaged it with great patience and skill the other sought to protect it, wrap it up with strength and intelligence to safeguard it for the future.\nIn this way, both Mestres's lifetimes of work coincided with each others, Mestre Bimba having ushered in a new wave of popularity with its legalisation and incorporation of athletic, martial aspects into Capoeira Regional. Mestre Pastinha knew that it would be wrong to lose the original style of Capoeira and so made his work in preserving every aspect and tradition of what would become Capoeira Angola.\nCapoeira had predominantly, in the eyes of many, gotten sloppy. Lacking discipline and organization, which would come soon enough, but not everyone would follow it but use its example. 
It is during this time that the police were becoming more intolerant of Capoeira due to the abolishment of slavery; they could no longer 'officially' treat some as lesser people and ignore their human rights.\nA rhythm for the Berimbau was also created in the early days of Capoeira; Cavalaria (Cavalry). The rhythm mimics the sound of mounted police, who were 9 times out of 10 the ones coming to arrest and punish the Capoeiristas having a roda or practising.\nTo this day our Mestres, professors and instructors like to play this rhythm to check our attention and also remind us of how lucky we are not to have to think that way anymore.\nIt is one of the most significant points in the History of Capoeira, the most significant in the History of Capoeira Regional and a massive point in the History of Brasil. It is 1930, in Salvador, Bahia, Mestre Bimba had cultivated his new style of Capoeira to a peak from which it would only climb higher and never look down. Mestre Bimba was invited to perform a demonstration of his Capoeira at the Governor of Bahia, Juracy Magalhães' palace. It was this demonstration that finally convinced the authorities of the value of this sacred art, culturally, historically and spiritually.\nCapoeira became 'legal' for the first time in the Country's history.\nFrom July 9th 1932 Mestre Bimba 'officially' opened his school CCFR - Centro de Cultura Fisica Regional, where his numerous students who were there at the birth of Capoeira Regional expanded ever onwards. To this day every single Capoeira Regional group can trace its origins back to Mestre Bimba himself through one or more of his students. His students remain some of the most brilliant (and famous) Capoeirista's to date: Mestre Decanio, Mestre Camisa Roxa, Mestre Acordeon, Mestre Itapoan, Mestre Onca Tigre (RIP) to name but a few.\nIn the '40s many Capoeiraistas began to move to Rio de Janeiro (and later Sao Paulo and so on), most of whom had trained with Mestre Bimba in Salvador, Bahia. 
Capoeira has now exploded across the world, in North America, Europe, Africa, Asia and Australasia! With some groups teaching in nearly every continent, Capoeira has continuously grown throughout the world spreading the knowledge and love of this beautiful art.\nMestre Pastinha would be happy to know that although this growth came predominantly from Capoeira Regional, that (according to Nestor Capoeira) people began to take a great interest in Capoeira Angola in the '80s. People were flooding to the old masters, who fortunately were still around, to find out about this great traditional style of the art form, where they could learn more about the mandinga (magic/sorcery) and spirit of Capoeira.\nThe last of the styles, Capoeira Contemporanea, was created from the two already existing styles Capoeira Regional of Mestre Bimba and Capoeira Angola of Mestre Pastinha. One of the pioneers and founders of the new style was Mestre Arraia. It is an evolution of the two older styles; it is as much homage to the days gone by as it embraces the future. 
Today Capoeira is always around us, it has become a way of life for a lot of us.\nCapoeira groups today may have slightly different styles, names of songs, names of moves, graduation system and so on, but one thing we all share is the love and desire to spread our art to everyone we can.\n"} +{"added":"2023-04-07T15:24:42.017872+00:00","created":"2020-03-29T09:30:25Z","id":"http://akindleinhongkong.blogspot.com/2012/02/walking-tour-stanley.html","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T09:30:25Z","digest":"sha1:6YUKMQTG2GOJDVAOJJS4F7Z23ZWBX6BE","language":"en","language_score":0.96,"length":2890,"line_ids":[9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,45,46,50,51,61,62,67],"nlines":39,"original_length":6354,"original_nlines":225,"perplexity":314.3,"provenance":"cc_en_head-0112.json.gz:13","source_domain":"akindleinhongkong.blogspot.com","title":"A Kindle in Hong Kong: Walking Tour: Stanley","url":"http://akindleinhongkong.blogspot.com/2012/02/walking-tour-stanley.html"},"source":"common-crawl","text":"This walking tour is the first from the far side of Hong Kong Island. You need to take the bus or a private car to get to Stanley, but it is well worth the trip.\nArrive at Stanley Plaza, the local shopping mall. You can park here and then walk down the steps by the Banyan trees.\nThere's an open square with a clean, healthy vibe.\nWalk past the kids' playground on your right.\nIt was particularly busy during Chinese New Year, and you could almost see the smoke from all the incense floating out the door.\nWalk along the paved corridor to your left. 
It's a straight shot from the temple to the sea.\nAt the waterfront you'll find the Maritime Museum.\nThere are tasteful nautical decorations nearby.\nThe museum is in the old ferry terminal building.\nFrom the pavement just outside you can see the pier on your right.\nLook left to see the fishing sampans and the rest of Stanley village.\nSince it was a public holiday, all the boats seemed to be in the bay.\nTurn left and walk along the boardwalk.\nFirst, you'll see another little temple...\n...but the rest of the street is filled with Western restaurants and pubs.\nWe saw pretty village houses...\n...and quite a few dogs wearing clothes.\nAt the end of the street, turn left and walk past the Fishermen's Recreation Club.\nFollow the colors to the shop at the end of the street.\nYou can stop for a fresh coconut drink if you need a break.\nTurn right in the direction of more shops.\nAt this point you're on Stanley Market Road.\nTurn right again and make your way into the market.\nYou'll see little shops selling clothes in the alleyways.\nStanley Market caters to tourists.\nIt wasn't too crowded on the day we were there. Many of the shopkeepers were making the most of the holiday.\nWhen you emerge from the market, you'll find you've made a loop back to the water.\nLook across the bay to the pier you saw earlier.\nAnd then make your way back along the waterfront.\n\nIt looks like Stanley hasn't changed much since I last went. The bus ride there is one of my favorites anywhere, and always made me think of a European riviera. I can't wait to go back! The people in your last photo were dressed for winter. 
I hope it's not too cold there.\nI was wondering when you'd do a Stanley tour, since I grew up here :) Though of course it had changed a lot since I left..."} +{"added":"2023-04-07T15:24:44.321465+00:00","created":"2020-03-29T09:37:26Z","id":"http://artsemersonblog.org/2018/09/04/from-russia-with-translation/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-16/segments/1585370494064.21/wet/CC-MAIN-20200329074745-20200329104745-00210.warc.wet.gz","date_download":"2020-03-29T09:37:26Z","digest":"sha1:DWRO6UW22GPL4V7RUHPRTUQAMJUVDNYR","language":"en","language_score":0.95,"length":3912,"line_ids":[29,30,32,33,34,35,36,37,38,42,43,51],"nlines":12,"original_length":5307,"original_nlines":63,"perplexity":309.8,"provenance":"cc_en_head-0112.json.gz:17","source_domain":"artsemersonblog.org","title":"From Russia with Translation - ArtsEmerson Blog","url":"http://artsemersonblog.org/2018/09/04/from-russia-with-translation/"},"source":"common-crawl","text":"Shakespeare’s legacy is rooted in language. The majority of students in the United States study Shakespeare in English classes throughout high school and the education surrounding Shakespeare curriculum is rooted in the linguistics and poetic form. His influence on English has survived centuries and his contribution to our vocabulary is often taken for granted. In Measure for Measure alone, we can thank Shakespeare for giving us the words “belongings,” “gnarled,” and “sanctimonious,” amongst a plethora of idioms and phrases. The intense study of Shakespeare’s English and the power of his words have fueled scholars and performers for centuries, keeping Shakespeare the most produced classic author.\nHowever, what happens when Shakespeare’s works are translated into other languages? Do the power of his words still retain their strength? 
In ArtsEmerson’s upcoming presentation of Measure for Measure, we invite audiences to hear Shakespeare’s words like you’ve never heard them before.\nFor Shakespeare especially, the universality of his work travels from culture to culture, language to language, without losing an ounce of substance. The stories remain intact, the emotions are still felt, and even those who cannot understand the words are able to understand the humanity. The human emotion Shakespeare is able to grasp is not bound by language. While his poetry is considered some of the most remarkable work we have, it possesses power beyond the page. How we communicate is often through unspoken means and Shakespeare, amongst his new words and famed phrases, allows for the breath for that very human connection on an emotional level.\nWe cherish his words, but perhaps Shakespeare’s ability to create connection is what keeps audiences entranced by the Bard’s folio. Shakespeare’s plays have been translated in over 80 languages, including Star Trek’s Klingon, making it remarkably clear that while he wrote in English, his plays resonate worldwide (and even into space apparently). 
With the upcoming production of Cheek by Jowl and Pushkin Theatre’s Measure for Measure, we invite audiences to engage with Shakespeare and perhaps experience something entirely new.\nCheek by Jowl and Pushkin Theatre’s Measure for Measure will be at the Emerson Cutler Majestic from OCT 24 -28.\n"} diff --git a/tests/data/multiple_files/cc_en_head-0174.jsonl b/tests/data/multiple_files/cc_en_head-0174.jsonl new file mode 100644 index 00000000..5d013d70 --- /dev/null +++ b/tests/data/multiple_files/cc_en_head-0174.jsonl @@ -0,0 +1,10 @@ +{"added":"2023-04-07T03:31:49.216968+00:00","created":"2020-06-05T04:25:29Z","id":"http://archives2.getty.edu:8082/xtf/view?docId=ead/2002.M.13/2002.M.13.xml","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T04:25:29Z","digest":"sha1:TC6SYGWAU4AT53EYYNVUEAZV7ESWS2R4","language":"en","language_score":0.94,"length":1264,"line_ids":[4,14,15,16,17,18,19,21,22,23,25,33],"nlines":12,"original_length":2500,"original_nlines":37,"perplexity":337.9,"provenance":"cc_en_head-0202.json.gz:7","source_domain":"archives2.getty.edu:8082","title":"Arntz (Wilhelm) collection of rare exhibition catalogs and printed ephemera Finding aid for the Wilhelm Arntz collection of rare exhibition catalogs and printed ephemera, approximately 1900-1985","url":"http://archives2.getty.edu:8082/xtf/view?docId=ead/2002.M.13/2002.M.13.xml"},"source":"common-crawl","text":"Creator/Collector: Arntz, Wilhelm F.\nAbstract: Collected by the German art expert Wilhelm Arntz, the ephemera document exhibitions, sales and publishing of 20th century art mainly in Germany, Italy, and France, but also in other European countries, the United States, and South America. Represented are not only well-established artists but also a profusion of lesser known artists as well as numerous emerging artists who became well-known after the mid 1980s. 
Among the institutions are European and American art museums and exhibition venues, publishing- and auction houses, printing presses, art fairs, and a vast number of art galleries.\nLanguage: Collection material is predominantly in German, English, French, and Italian, with most other European languages also present."} +{"added":"2023-04-07T03:31:49.631789+00:00","created":"2020-06-05T02:40:44Z","id":"http://artseast.blogspot.com/2019/10/gillian-smith.html","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T02:40:44Z","digest":"sha1:UDYSZKQ4LR6C4EAVJX557NOFBIKKESCA","language":"en","language_score":0.97,"length":4827,"line_ids":[21,22,23,24,25,26,27,29,31,32,33,34,35,36,37,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55],"nlines":32,"original_length":6543,"original_nlines":189,"perplexity":309.4,"provenance":"cc_en_head-0202.json.gz:8","source_domain":"artseast.blogspot.com","title":"Gillian Smith","url":"http://artseast.blogspot.com/2019/10/gillian-smith.html"},"source":"common-crawl","text":"Nova Scotia violinist Gillian Smith just released her debut recording Into the Stone. Recently we spoke with Smith about the album, and about her passion for her profession.\nWhen and why did you develop a passion for music?\nWhen I first started learning the violin with Ninette Babineau and Pat Wyman in Halifax, I fell in love with the instrument. I just loved the sound! Later on I discovered string chamber music and was completely hooked - I remember listening to a recording of the Brahms viola quintets over and over again, thinking that it was the most amazing thing.\nI still love the violin as much (or more) than ever, and I also find teaching the violin very rewarding. 
I did my doctoral degree in violin performance with the American violinist Jorja Fleezanis, who is a huge proponent of new music (the John Adams violin concerto is dedicated to her and was premiered by her), and really discovered a love for new music in my studies with her.\nI’ve worked to try to take more risks musically and to try to create more colours and sounds, and to expand my range of expression.\nSometimes there are moments of self-doubt, but you just have to push through those!\nIt’s incredibly rewarding to perform and record great music, and to work with wonderful musicians.\nI’d like to think that I can create a beautiful sound on the violin, with a range of different colours that I can draw upon. It’s definitely exciting to try to find new sounds and new ways of playing something.\nI feel very lucky to have been part of recordings of the amazing music of Derek Charke (Live Wired and In Sonorous Falling Tones, which was nominated for an ECMA for Classical Recording of the Year in 2018) with Derek Charke, Mark Hopkins, Mark Adam, Jeff Hennessy, and others, and more recently to have been part of a recording of the wonderful chamber music of Carmen Braden, Songs of the Invisible Summer Stars, which was released in September 2019, with Mark Adam, Derek Charke, Suzanne Lemieux, and others.\nI really wanted to create a recording of music by Canadian women, and I was thrilled to have the opportunity to record phenomenal music for solo violin by the amazing composers Kati Agócs, Alice Ping Yee Ho, Veronika Krausas, Chantale Laplante, and Ana Sokolović.\nI was very lucky to work with Jeremy VanSlyke of Leaf Music, who produced the album. We recorded in the stone sanctuary of First Baptist Church Halifax, and it was wonderful to be able to make the recording in that beautiful space. 
I’m extremely grateful to FACTOR, the Government of Canada, and Canada’s private radio broadcasters for their support for Into the Stone.\nAlice Ping Yee Ho’s Caprice is a joyful, energetic work that really captures an emotion so vividly. It’s so much fun to play this piece!\nI’m grateful for the positive feedback I’ve received so far! One of the tracks from the album - Veronika Krausas’ piece Inside the Stone - was recently selected as a Spotify New Classical Release Pick, which was very exciting.\nPreparing to record the album definitely required some hard work, but at the same time I was striving to follow the composers’ directions in the music, which reveal their concept of the works, and to bring my own musical imagination to the process.\nWhat makes a good tune?\nA piece that really captures a mood or emotion; a strong rhythmic element can also be really compelling.\nWhat are your thoughts on Atlantic Canada’s music scene?\nI’m thrilled to be part of Atlantic Canada’s fabulous music scene and am so grateful for the wonderful collaborations and projects that I’ve been involved with here with such amazing musicians.\nI’m most grateful for FACTOR’s support for this album, and it’s wonderful that there are many organizations from both the government and the private sector that provide support for musical projects of various kinds. 
It could only be a good thing if there were even more of this kind of funding, which makes projects like this one possible.\nWhat do you have on tap for the rest of 2019 and going into 2020?\nAfter the Toronto show I am looking forward to performing some great repertoire for violin, clarinet, and piano, including the fabulous Bartók Contrasts and Stravinsky’s L’Histoire du Soldat, with the wonderful Eileen Walsh, clarinet, and Simon Docking, piano.\nGillian Smith will launch Into the Stone in Toronto at the Glenn Gould Studio on Saturday, October 5 with special guest Emily Rho, piano.\n"} +{"added":"2023-04-07T03:31:53.457061+00:00","created":"2020-06-05T02:23:04Z","id":"http://blog.captainthin.net/?tag=kjv-only","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T02:23:04Z","digest":"sha1:NIHTK7DG3BXYNMNXDJLB5CQTHVAMMBZH","language":"en","language_score":0.91,"length":1817,"line_ids":[6,9,12,13,14,15,16,17,18,20],"nlines":10,"original_length":4019,"original_nlines":68,"perplexity":312.7,"provenance":"cc_en_head-0202.json.gz:17","source_domain":"blog.captainthin.net","title":"KJV only « Captain Thin","url":"http://blog.captainthin.net/?tag=kjv-only"},"source":"common-crawl","text":"Entries tagged with “KJV only”.\nSometimes we get strange things in the mail at work. Today is one of those days. I received a package with all sorts of… er, “useful”… information, including photocopies of some prophetic end-of-the-world magazines . One of the photocopied articles claims that the Vatican and Muslims are about to fight out a new Crusade over Jerusalem (because, you know, Catholics=Bad; who else would be responsible for a new crusade?).\n1) A list of heresies (quite handy), which notes such heretical acts as the use of “wax candles” in church.\n3) A photocopy of a handwritten statement on the dangers of electromagnetic radiation. 
But you can apparently reduce the danger by purchasing Tesla Purple Free Energy Plates. Good to know.\n4) A note that only those who tithe exactly 10% (that’s before deductions, you sinner!), observe the Sabbath from Friday sundown to Saturday sundown (not Sunday, you heretics!), and use the King James Version of the Bible (drop that NIV and ESV, you reprobates!) will be saved. “Everyone else is under a curse and will be eliminated when Jesus Christ returns in the near future.” (One of the end-of-the-world article photocopies dates back to 1984, so I guess “near future” is a relative term).\n"} +{"added":"2023-04-07T03:31:53.593808+00:00","created":"2020-06-05T02:49:55Z","id":"http://blog.kevinmay.com/tag/santa-fe/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T02:49:55Z","digest":"sha1:GQLIUZHMUWAIRMBGJVZSROCQYRO5YX5E","language":"en","language_score":0.94,"length":2861,"line_ids":[14,17,18,19,20,21,22,25,26,29,30,31,32,33,36],"nlines":15,"original_length":5171,"original_nlines":189,"perplexity":295.6,"provenance":"cc_en_head-0202.json.gz:18","source_domain":"blog.kevinmay.com","title":"Santa Fe | Kevin May Photography","url":"http://blog.kevinmay.com/tag/santa-fe/"},"source":"common-crawl","text":"Walking around Santa Fe!\nMonday morning Peoria greetings to one and all!!!!!\nIt sounds like we’re going to get popped with some more winter weather today here in Central Illinois! One of these days winter will be over, but probably not soon enough!!!!\nWhen I was out in Santa Fe, N.M. a couple of weeks ago, I was able to spend a day walking around the downtown and shooting some photos! For me it’s always fun to get out and do this and it’s even more fun in a town like Santa Fe. The Southwest has a lot of great architecture and it’s just so different then what we see here in the Midwest! 
Lots of warm adobe tones and lots of turquoise colors really work well together and make for a lot of great photos!\nCathedral Basilica of St. Francis of Assisi and is a statue of Kateri Tekakwitha. She is the first Indian of North America to be promoted a saint. The photo on the left is a statue at the Museum of Contemporary Native Arts called “Water Carrier”. The center photo is the inside of the Cathedral Basilica of St. Francis of Assisi. The last photo on the right is of the Santuario de Nuestra Señora de Guadalupe.\nIt’s time to get to work, so I hope that you have a wonderful week and thanks for checking out my work!!!!\nPhotography in Santa Fe, NM.\nCold Peoria Monday morning greetings to one and all!!!!\nI was out in Santa Fe last week doing a shoot for my friends at Modern Quilts Unlimited magazine! We were shooting on location in Santa Fe at the beautiful Las Palomas B&B and it made the perfect location for all these quilt shots that we did! We were able to shoot in a bunch of the different rooms there and it made for some very interesting backgrounds! Some of photos were also taken outdoors and we really lucked up because the evening after we finished shooting, Santa Fe got four inches of snow and that would have ruined any outdoor photography!!! 
Fortunately for us we were able to get the shots that we needed, so everything worked out great!\nHere’s one of my shots from Las Palomas and I think everyone is very happy with the shoot!!!!\nThanks as always for taking the time to check out my work and I hope that you have a great week to come!!!\n"} +{"added":"2023-04-07T03:31:53.686329+00:00","created":"2020-06-05T01:50:28Z","id":"http://blog.stevengriggs.com/2013/05/who-do-you-see-when-you-look-in-mirror.html","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T01:50:28Z","digest":"sha1:HNIN6XT3NWQYLM5MA444IIRZQLS3RPEN","language":"en","language_score":0.98,"length":4117,"line_ids":[4,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,47],"nlines":35,"original_length":7007,"original_nlines":92,"perplexity":266.6,"provenance":"cc_en_head-0202.json.gz:19","source_domain":"blog.stevengriggs.com","title":"Who Do You See When You Look In The Mirror | The Freedom Blog by Steven Griggs '; } } if( dayCount > fill[valxx]){ cell.innerHTML = ' '; cell.className = 'emptyCell'; } dayCount++; } } visTotal = parseInt(startIndex) + parseInt(fill[valxx]) -1; if(visTotal >35){ document.getElementById('lastRow').style.display = ''; } } function initCal(){ document.getElementById('blogger_calendar').style.display = 'block'; var bcInit = document.getElementById('bloggerCalendarList').getElementsByTagName('a'); var bcCount = document.getElementById('bloggerCalendarList').getElementsByTagName('li'); document.getElementById('bloggerCalendarList').style.display = 'none'; calHead = document.getElementById('bcHead'); tr = document.createElement('tr'); for(t = 0; t < 7; t++){ th = document.createElement('th'); th.abbr = headDays[t]; scope = 'col'; th.title = headDays[t]; th.innerHTML = headInitial[t]; tr.appendChild(th); } calHead.appendChild(tr); for (x = 0; x 
(')[1]; var selValue = bcList[r]; sel.options[q] = new Option(selText + ' ('+selCount,selValue); q++ } document.getElementById('bcaption').appendChild(sel); var m = bcList[0].split(',')[0]; var y = bcList[0].split(',')[1]; callArchive(m,y,'0'); } function timezoneSet(root){ var feed = root.feed; var updated = feed.updated.$t; var id = feed.id.$t; bcBlogId = id.split('blog-')[1]; upLength = updated.length; if(updated.charAt(upLength-1) == \"Z\"){timeOffset = \"+00:00\";} else {timeOffset = updated.substring(upLength-6,upLength);} timeOffset = encodeURIComponent(timeOffset); } //]]>","url":"http://blog.stevengriggs.com/2013/05/who-do-you-see-when-you-look-in-mirror.html"},"source":"common-crawl","text":"Who are you? What message are you projecting out to the world? Did you even think you were projecting a message?\nMaybe you are not consciously trying but by not trying to you are still sending a message.\nWho are you trying to notify? What information are you trying to present?\nIt’s all about presentation, how we present ourselves helps notify others about who we are. It helps them see what we believe and where we stand.\nIt’s a warning and a beacon. \nBut the bottom line is that we do it because we all just want to belong.\nIf you were a new visitor to our planet, you would see that we are all basically wearing costumes.\nYour clothes, your shoes, your hair or makeup, everything you are presenting on your body says something about you.\nYour costume identifies your tribe. Or in the case of Amazon Indians, your hair and your tattoos.\nThe motive here is to show other tribe members you belong to their tribe and at the same time show everyone else that you are not a member of theirs.\nOur costumes are really more about exclusion than inclusion.\nSometimes people are a little in your face in their efforts to emphasize that they are not a part of your tribe.\nBut it’s all just an effort to belong, especially in younger people. It’s almost a requirement. 
They need to break away from their parents, from the establishment, to show their independence.\nAnd that sometimes requires drastic action to make the right statement.\nThey use bright hair colors and drastic styles, tattoos, logo wear and clothes, piercings, etc. to make their point. They use shock and horror as a tool.\nI think we have all been there once upon a time. Maybe some of us were more drastic than others.\nI was 14 in 1964. I was in a band. I grew up at Lake Tahoe and hitchhiked to spend weekends in San Francisco in the Haight Ashbury district. What do you think I was doing? How do you think I looked?\nTalk about shock and horror. I freaked my family out!\nAt least for a minute but then my family is a bit eccentric. Take a look at my mother (Google \"Granny G\". She was a finalist on America's Got Talent.\nBut it’s just a period of transition, a transitioning out of one phase into another. Some phases take longer than others and some phases can become almost permanent. I’m sure you know people who are locked into a phase.\nDoes it matter? I don’t think so.\nBut think of all the tribes that you can immediately identify. You can see them everywhere, every day.\nFor example, look at biker clubs.\nI was at a rally the other day where my friend's band was playing (I was the roadie and part time singer) and there were about 5 or 6 biker clubs there. For me it was very interesting to see the hierarchy being displayed. With the tattoos, the bandannas, the leather, the patches, the prospects trying hard to belong, to earn their full patch. You can see the whole process of belonging in action.\nI have no problem with bikers, I get it. I like bikes. My best friend from childhood has been a Hell’s Angel since the sixties. 
He’s probably one of the few remaining Outlaws alive from the early days of the club.\nBut they’re tribes.\nLooking from the outside, although we may not really get how unique it is when several million kids throughout America are dressed the same way, we understand.\nWe can see how the need to be accepted by a group of like souls is so important.\nIt’s important to belong, to gain confidence and become more solid. To begin to become who you are.\nIt takes steps, phases and tribes.\nBecause it doesn't stop when you leave one tribe behind, you just become part of another tribe.\n"} +{"added":"2023-04-07T03:31:53.787371+00:00","created":"2020-06-05T02:00:48Z","id":"http://blogs.boardprospects.com/articles/3928691/opendoor-adds-to-leadership-team-board-of-director/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T02:00:48Z","digest":"sha1:5BAFUQVF3H4APJ2LUOSVG5YXT3G3JGNY","language":"en","language_score":0.97,"length":604,"line_ids":[7,9,18,22],"nlines":4,"original_length":2445,"original_nlines":56,"perplexity":325.5,"provenance":"cc_en_head-0202.json.gz:20","source_domain":"blogs.boardprospects.com","title":"Opendoor Adds to Leadership Team, Board of Directors - U.S. Housing Finance News -BoardProspects","url":"http://blogs.boardprospects.com/articles/3928691/opendoor-adds-to-leadership-team-board-of-director/"},"source":"common-crawl","text":"Between new partnerships and expanding into the mortgage lending business with the launch of Opendoor Home Loans, Opendoor has seen quite a few changes in the past few months. 
That trend continues for the iBuyer, as its CEO and co-founder, Eric Wu, announced the addition of new leaders to the executive team and board of directors..."} +{"added":"2023-04-07T03:31:54.199022+00:00","created":"2020-06-05T02:45:16Z","id":"http://bookmarktoday.info/page/97/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T02:45:16Z","digest":"sha1:XJKOSPZ2UGI7SRTGXPOU4CPOTYTCUBOE","language":"en","language_score":0.97,"length":25942,"line_ids":[6,7,8,9,10,11,12,13,16,17,18,19,20,21,22,23,26,27,28,29,30,31,34,35,36,37,38,42,43,44,45,46,47,50,51,52,53,54,55,58,59,60,61,62,63,64,65,67,68,69,71,72,75,76,77,78,79,80,81,84,85,86,87,90,91,92,93,94],"nlines":68,"original_length":27680,"original_nlines":117,"perplexity":314.0,"provenance":"cc_en_head-0202.json.gz:22","source_domain":"bookmarktoday.info","title":"Pet and Animal – Page 97 – Pet Articles","url":"http://bookmarktoday.info/page/97/"},"source":"common-crawl","text":"The building industry has recently been revolutionized. The industry has seen the use of new technology. Building of tall buildings is taking place in the world. This can be attributed to the further research that people are undertaking. New ideas get developed each new day as scholars do more in their work. Competent people are venturing into the industry having have undergone substantial training. Among the several notable advancement in this field of architecture, is the building of skyscrapers. Majority of cities in the world have tall skyscrapers that are being used for business.\nAmazingly, these skyscrapers are architecture using glass as the building material. The finishing of this building is often done using exterior glass wall panels. You will find that some roofs are transparent or translucent. Such roofing system allows sunlight into the room. 
Several complex designs are being used in establishing these building. Safety is an important aspect that is always considered when undertaking this building works.\nDaylighting systems have been proven to be beneficial in so many ways. Daylighting refers to techniques used to allow essential sunlight into buildings. This article intends to describe some of the benefits of using daylighting systems which allows sunlight into the room.\nThrough daylighting systems, conservation of electricity is possible. It is often expensive to manage electricity deals especially when the electric energy consumption is high. Companies that have not embraced the technology of daylighting often have no other choice but to use electric bulbs. For this reason, the companies that do not use electricity bulbs have significant reduction on their bills. In the long run, the operation cost for such companies become significantly lower. Offices that do not operate during the night, therefore, have no bills arising from electrical lighting systems.\nDaylighting systems ensure that the building acquires an amazing outlook, kindly view here for more details. Most of the designs used in establishing this kind of building are aimed at creating that superior outlook. Glass covering is a common feature for most of the beautiful building designs.\nThe other major benefit is a health benefit. It has been proven that sunlight makes a person healthier. Busking in the sun will grant you the opportunity to harness some vitamin D which is essential for the strengthening of the bones.\nTips When Finding A Professional Plumbing Contractor.\nA plumber should be contacted now since they deal with the following noble services. If your establishment requires the following plumbing utilities; you may hire a prominent plumber for fitting activities. 
In your apartment, you may be seeking proper fitting activities for the water pipes, taps and sewer lines so contact a distinctive plumbing agency for such noble operations.\nThey will also fit the best toilet flushing systems, drains, showers and the bathtubs on your bathroom. A plumber also deals with the repair services for the leaking water pipes, sewer lines and they also unblock the clogged drains. A reliable plumber will also offer maintenance services for different plumbing utilities so they can ensure they are in good condition.\nChoose a prominent plumber since they offer free advice on how to choose a precious plumbing utility. Take time to examine and research different plumbing entities based on their effectiveness and this will aid you in booking a magnificent plumbing agency. Search for the best plumbing agency from close friends and past clients since such Suburban Plumbing have been examined and proved in service.\nAlso, visit the local based plumbing contractor as they are meticulous, immaculate and preferred by many people in service. With the rise of many online-based plumbing contractors, one will find easy time knowing their features and attributes. View their frequently asked questions, their comments and all the reviews they’ve received from their past clients.\nBefore you choose a specific plumbing contractor, one needs to examine if they are wrapped up with the following attributes. Different plumbing contractors will serve you with their email address, phone numbers and website links so call today to prove they are legitimate in service. Again, choose a responsive plumbing contractor for they have a 24/7 operations to their customers.\nProve also if the plumbing contractor booked like Suburban Plumbing OC is licensed, certified and accredited for offering remarkable service to their clients. Bank on a licensed plumber like Suburban Plumbing OC so you can be assured of genuine, real and valid operations. 
Let the plumber serve you with all their credentials and testimonials to prove they’ve been specialized on how to offer services.\nThe essence of picking a trained and specialized plumbing agency like these plumbers is they are qualified and competent in service so they will follow the due process in the activities. Check if the plumbing agency chosen have accomplished different plumbing services for extended period for this makes them exposed in service. When an exposed plumber is booked one will be guaranteed of effective and exemplary service. A precious and enviable plumber will leave an indelible mark on their endeavors.\nIf it is a construction project is what you have at your hands then see to it that you will be hiring the right Los Angeles general contractor. You need to know though that there are many options that you can have in the market these days. It is important that you are able to consider some factors when looking for. Whenever it is these factors are what you will be considering then you will be guided in choosing the right one.\nOne of the things that you will need to do is to make a shortlist of a potential Los Angeles general contractor in your area. It is this one that you are able to do once you will be taking recommendations from people that you know. It can also help once you will be able to ask from building-supply stores in your area. Once you will be taking a look at online stores then it is them that can also provide you with vital information about Los Angeles general contractors in your area.\nIf it is a general contractor is what you are looking for then see to it that you will be looking at the qualifications that they have. You need to see to it that you will be choosing the one that has a license and sufficient insurance coverage. It is by ensuring these things that you will be protected. For you to be able to verify theses things then you can meet with the contractor in person. 
It is also important to know if they can comply with the work and scheduling requirements that you have. It can also help once they are able to provide you with references of satisfied customers.\nWhenever you are looking for a Los Angeles general contractor then make sure that you will be taking them for a quote. It is getting a good price that many people would want to do. The Los Angeles general contractor that will offer the lowest bid is to the one that you should choose. Having enough experience is not what these Los Angeles general contractors will have once they will be offering you the lowest bid. You will need to consider the quality that they can provide instead of just focusing on the price alone.\nWhenever you are looking for a Los Angeles general contractor then makes sure to always go for the one that is easy to work with. Looking at how they conduct themselves is a thing that you will need to do. It is also important that they will be able to approach the task in a professional manner. once it a Los Angeles general contractor is what you are looking for then make sure that they are committed on the project that you have. See to it that you will determine the amount of personal time that the Los Angeles general contractor will be able to allot for your project. It is them that should be able to provide timely, successful, and cost-effective completion.\nThere are many diagnoses taking place in most hospitals nowadays. Most of these diseases are believed to emanate from lack of observing our daily diet. You will note that some measures have been taken to help in relieving the rise of the conditions. One of the measures that have been of great help is the invention of the modern technology, which is used to carry out tests on these diseases as per this site. The advancement of this site is the main reason why it has been easy to minimize the number of conditions. 
Getting to know more about the diseases affecting women is possible if you take time to read on this article.\nIt is good to have adequate measures adopted to help in mitigating diseases affecting women. One effective way of ensuring the conditions affecting women are treated once and for all is to have the women imaging centers adopted. The beautiful thing with the imaging centers is the fact that you will have lost of diagnosis tests being carried there. Some of these womens imaging center flemington nj services include mammography, breast ultrasound, and breast biopsy, among others. Mammogram refers typically to the x-ray conducted on the breasts.\nOne effective way of ensuring the right detection of breast cancer is conducted is to go for the mammogram test. It is good to go for testing, and medical attention in case you realize there is a lump on your chest. Many women can detect breast cancer at early stages through going for mammogram tests. One effective way of ensuring that women can detect breast lump is to go for screening early. It is possible to have the right screening done on your breast if you decide to visit the right centers. Instant treatment needs to get started the moment one has realized they have breast cancer.\nReport indicates that only a few people have an understanding of the treatment of breast cancer. Visiting the right mammogram centers near me will help in treating breast cancer once and for all. Discovering that you have breast cancer first usually helps in treating it thoroughly. Early discovery of breast cancer is the best way to have one starting the treatment program. With this site that has been made with the advancement of technology, now the breast cancer can easily be detected through digital mammography. 
It is good to have the right procedures done on breast screening once you adopt the digital way.\nThe importance of event planning companies is highly recognized by professional organizations that constantly hold events at their companies. Most clients are faced with difficulties discerning the most qualified event planners from the multitude of the services providers with capabilities to offer services to match their needs. There are several factors that must be put in consideration when making a choice of the event planner to serve a company’s needs to their satisfaction. Clients should first evaluate their needs before committing to the search of the event planning companies because they will have an easy time shop talking for their need; however, it is crucial that a person choose a company that is able to provide services in all fields for efficiency and to make them an easy option for their future needs.\nFinding references from friends, family members, acquaintances or other people who have committed to the services of the event planners before is the most efficient way of beginning the search process. They can therefore look up about the best event planners from the online tools that are used by these companies to create awareness about their services and confirm their capabilities before committing to them. Once the client has enough references, they should go ahead and get more information about these companies and use it to discern the most capable event planners to hire for their needs.\nIt is essential that the clients choose a company with the best online reviews, the most post comments by preceding clients, the highest company ratings and the highest rankings since these are the ones that show the highest capabilities. 
There are many fields of expertise that different event planners prefer to involve in and therefore it is essential that a client also does a research to learn more about event planning expertise that the service provider they consider has. The experience of the event planning companies in service provision is also an important factor to ponder before committing to any service provider.\nSince a lot of car accident attorneys offer a free consultation; anytime one gets involved in an accident, that is the person you should consult because they will offer advice on the plan one should take. Getting an accident lawyer jacksonville fl helps people to receive the compensation that one needs since they can investigate your rights and claims which is useful in offering people the best services. If you search for attorney near me, there will be a lot of results about them so, find out what advantages one stands to fail by working with such people.\nOne should remember that using a such companies is the only way to make sure that people do not get overwhelmed during the process. In case one is working with a reputable attorney, they will help you to focus on the recovery process without worrying about anything else, thus making sure that things fall into plan as one would have expected.\nAttorneys never compromise, and if you are interested in finding florida workers compensation lawyer, they will be useful in ensuring people get the ideal settlement that best suits you. 
An experienced attorney will be useful because they know ways to negotiating until the settlement suits you.\nThe legal process only looks complicated when one is not working with an expert; therefore, it is best to ensure that the legal process is easy so these people can take over the bulk of the work one is dealing with, thus making sure that the negotiation process and anything else that is done perfectly.\nA lot of times, clients lose confidence in the legal process, and when one lacks motivation, it means that people end up settling for less value and avoid getting frustrated.\nWhen one is working with insurance companies the settlement process could take weeks or months which could be quite frustrating; therefore, choosing attorney speeds up the process as one can see in this link.\nWhen one is looking for Floridas Personal Injury Lawyer; there are a lot of things that people can find regarding the team so, see ore here on how one can nave time and still get effective results. A person should settle for the right plan and ensure that there is someone you can rely on at all times.\nFeatures Of A Professional Mold Inspection Firm.\nWith the rise of mold inspection firms, it’s always peculiar to compare and scrutinize their details for ease of knowing if they are effective and worthy. Mold inspection agencies will examine and inspect if there is mold in your homes and then draw up the best strategy to deal with such molds. Mold inspection agencies should be booked based on their effectiveness so when you are free, visit them on their local areas for consultation services.\nWhen searching for a reliable and fabulous mold inspection firm, you need to visit those referred and redirected to you by their past clients since they are meticulous. Remember most of the appealing and fabulous mold inspection firms have embraced online marketing operations, so you need to embrace them from their sites. 
click for more on the frequently asked questions and the reviews the online-based mold inspection firms have on their websites, and this will enlighten you.\nThere are many issues you need to consider now when finding a competitive mold inspection service provider as outlined in the following context. Let the mold inspection firm prove they are accessible and legitimate through their email and phone numbers. There is also need to hire a responsive mold inspection and remediation company as they are worthy and offer 24/7 operations to their customers.\nWhen dealing with a specific mold inspection company, you need to check on their operational permits and other documents to prove these agencies are real and enviable. If the mold inspection company booked have been verified, validated and accredited for service by the local administration, hen such agencies deserves a tick for they are protective of their customers. Always bank on a trained, educated and specialized mold inspection firm since they are well versed on how to offer immaculate service.\nThe benefit with these mold remediation toronto is they are qualified and competent in service, up to the task and therefore reliable always. A long time serving mold inspection company ought to be booked for they have worked with different clients. Booking any experienced and long-time serving mold inspection firm is enviable for these entities are knowledgeable and have pertinent tricks in service.\ndiscover more also if the mold inspection fir have invested more about their professional dealings and this confirms they are ready and willing to give their best. A reliable and timely mold inspection agency ought to be considered for service since they don’t back down when serving their customers. 
Again, a high quality-oriented mold inspection firm should be considered as this shows they are thrilling, thriving and exceptionally viable.\nWhen you are in a situation where you have financial distress, the best thing that you can do is to look for a solution. One of the best things that you can do in order to sort this out is by working with companies that will help you. You can be able to get good results if you decide to take loans, that is how you get the money that you need. The availability of companies that will help you is a good thing and therefore, you just have to work with them. You’ll be able to get help from these companies simply because they are willing to and, they will not care about your financial situation. The fact that they usually operate from online platforms is one of the reasons why you should work with them. Getting to contact them is not going to be a difficult process, it’ll be very easy for you. Provided you are in the jurisdiction, they are always able to provide you with the amount of money that you need and the application will be done online. When you work with them you’ll be able to get all the following advantages.\nThe first advantage of getting personal loans from these companies is that they’re going to provide you with these loans regardless of your credit score. They’re going to provide you with about $500 that you need very easily once you give them that opportunity. However, most of these companies are also very careful about giving you the money especially if you are faithful about your repayment. Helping you to have your credit score sorted out is also one of the biggest things that the short term loans for bad credit companies are going to do for you. With many of the projects that you’re doing, the company is going to sort you out, discover more. First Financial does not ask for too much information when doing the application, it is very easy to apply for the loan. 
First Financial will never take a long time to approve the first loan that you ask from them, they will be very easy to approve. With most of these emergency loans companies, you can even get the personal loan that you wanted the same day.\nService dog training plays the best role in so many ways. Everything is quite good when you focus to have the better part of this. It is ever easy when you manage to discover more. You could find out how well you will note the training of the dog near you. It shall now be grateful since you find the better part of what you need. There are better chances for training your dog. Thee are the most effective relations that will be impressed. The dog can be relating well to people. Such skills will be good for your dog. Getting to train your dog has the following benefits as shown in this site.\nThe owners of the dog relate well. By training the dog you can increase its good relations. You cannot manage to do it. You can prepare to have the schedule for dog training California to know what to do. If you happen to own a good dog then you better plan for the training. This is the unique and useful criteria for making everything to work out well. You could find some details on this article to help you. With the details there is much that you could prefer to be doing for your dog. You are thus advised to check it out on the best that you may do by hiring the company.\nReduces cases of harm to humans. A dog can bite and harm people. With the act of choosing the training services you can have the best plan on this. You can plan and browse this website to help you know what you can do. Once you manage to see more here it is very easy for you. Your dog will be enjoying the best skills. The skills will help n eliminating the act of having to harm your dog. You may be enjoying to have the best part of training your dog by having the training services. 
You could be getting more when you manage to deal with the best training of the dog in the school.\nGetting the best dog training schools near me will bring out the best dog behavior. This can help you improve the general behavior of the dog. Choosing the best training for your dog is quite good. There is more than you can find out how to do in this article. You can have everything fixed in a better way. It is the better way in which it could aid you. It as well possible to have all in this link."} +{"added":"2023-04-07T03:31:54.931204+00:00","created":"2020-06-05T02:52:10Z","id":"http://busineserviceny.info/improve-your-leadership-skills-by-using-these-ideas/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T02:52:10Z","digest":"sha1:XQDXTQEKJ2P2UL5UC4KHOAQD55VSSAQV","language":"en","language_score":0.96,"length":3444,"line_ids":[18,19,20,21,22,23,24,26,27,28,29,39,44,48,94,95],"nlines":16,"original_length":6909,"original_nlines":117,"perplexity":280.4,"provenance":"cc_en_head-0202.json.gz:24","source_domain":"busineserviceny.info","title":"Improve Your Leadership Skills By Using These Ideas – BUSINESS-service-ny","url":"http://busineserviceny.info/improve-your-leadership-skills-by-using-these-ideas/"},"source":"common-crawl","text":"Since being a leader means others look to you for guidance, becoming one may be difficult. There’s no one right way to be a good leader; it takes a lot of skills in a lot of different areas. Learn the skills you do not have to become a better leader and work on the skills you do have.\nLive the vision of your team. Create a mission statement that helps all employees understand your vision for the company. It is important that you communicate the bigger picture while you help your team see how their roles play a part in the big ideas. 
It is a great way to offer direction and build a healthy professional relationship.\nHonesty is a crucial leadership characteristic. Good leaders should be trustworthy. As you work to develop your leadership skills, attempt to always set a precedent of honesty and trustworthiness. People will respect you in a leadership role if they know you’re reliable and trustworthy.\nLook for talent in your team. When you are in the process of looking for people to help you, it should be simple for you to decide who would give you the best benefit. That applies both to contracting people and hiring them.\nBe open about any issues that arise in a timely manner. Great leaders used to cover up problems; that isn’t the case anymore. What is the reason for that? There is a lot of communication in the world today. What’s done in the dark comes out in the light eventually. Instead of reacting, control. This is the path of true leadership.\nWhen dealing with employees and customers it is so important to be ethical. Every sound business must be ethical. Customers will keep coming back if they can trust you. When your team is expected to follow a moral code, the rules are probably going to be followed easily.\nHire people from different backgrounds to help you. All types of people can give your organization many different ideas from which to draw on. This will limit the amount of innovation in your business. It can also make a company fail.\nLearn to listen. If you wish to be a great leader, that starts with listening. Listen to what your workers are saying. Address both their concerns and anything positive they have to say. You can learn something from anyone, no matter who they are. You will be shocked at what you learn by listening.\nYou must be able to communicate effectively and clearly with others. Make sure your team has the information they need to complete their tasks, including the deadlines. 
Check in here and there to make sure everyone is on track.\nLeadership is about developing methods you can use to effectively guide others to greatness. Using what you have read here will make a big difference. Stay humble and always be open to learning new things.\n"} +{"added":"2023-04-07T03:31:56.011971+00:00","created":"2020-06-05T04:26:23Z","id":"http://cempaka-tourist.blogspot.com/2012/01/","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T04:26:23Z","digest":"sha1:IIKPCAQZ7WIPJUHI3H43THXWCSL743JH","language":"en","language_score":0.97,"length":33474,"line_ids":[166,167,168,169,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,219,223,225,226,227,228,229,230,231,232,233,234,235,236,237,240,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,297,299,300,301,302,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,328,330,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,365,366,367,368,371,374,375,376,377,378,379,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,398,399,403,404,405,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,437],"nlines":198,"original_length":95171,"original_nlines":1501,"perplexity":306.7,"provenance":"cc_en_head-0202.json.gz:27","source_domain":"cempaka-tourist.blogspot.com","title":"Cempaka Culture and Tourism: January 2012","url":"http://cempaka-tourist.blogspot.com/2012/01/"},"source":"common-crawl","text":"Two years ago, eight young Indonesian women came together for a project. 
Although at first it seemed small, the endeavor would end up having a bigger impact on their lives than they ever imagined.\nIn a country where a declaration of homosexuality at best raises eyebrows and at worst incites blind hatred and prejudice, “Srikandi” is a brave movie that sheds light on the lesbian, gay, bisexual and transgender (LGBT) community here.\nIt is especially unique because the filmmakers themselves are part of that community and were able to honestly convey their own experiences, feelings and thoughts on the screen.\nStea Lim, who is both a director and executive producer of the film, said the project was initiated by Laura Coppens, a German visual anthropologist who wanted to make a film about the struggles of Indonesian women. The project was realized with the help of the Goethe-Institut, the German cultural organization, and In-Docs, the documentary program of the Society of Independent Indonesian Films (YMMFI).\nCoppens said she began traveling to the region and Indonesia on a regular basis about five years ago.\n“This was basically because I am an Indonesian film curator and festival programmer for Asian Hot Shots Berlin, a festival for Asian independent cinema I founded with two friends in 2007 in Berlin,” she said.\nAfter becoming more involved with the regional cinema scene, Coppens realized there was a general lack of films about homosexual women, and there certainly weren’t any examples from Indonesia.\nShe found support from Berlin-based filmmaker Angelika Levi, who became a mentor and editor for the film. Together, the two have seen the project through from beginning to end.\n“She is an experienced filmmaker and a wonderful storyteller. It is mainly also because of her involvement and engagement that our film is as successful and beautiful as it is now,” Coppens said of Levi.\nThe diversity of the group members was key, Stea said.\nEight women, eight films. But the participants did more than simply work on their own stories. 
They acted in one another’s films, helped with narrative development and served as crew members.\nSrikandi not only makes an appearance in the title of the film, she also plays an active part in it: her story is told between the eight short films through Indonesian shadow theater scenes performed by puppeteer Soleh and singer Anik, two male-to-female transgender individuals from Surabaya.\nFor the women involved in the project, making “Srikandi” was a unique experience. Many of them had never done anything like try to make a movie.\nOne of the women is Edith, a political science student in Yogyakarta. She said she enjoyed being part of the project.\n“It was very exciting for me to learn how to make the storyboard, to think about the visuals, the images and the sounds, and how it all needs to be connected,” she said, adding that she could even imagine continuing with filmmaking in the future.\nAnother woman involved in the project, Winnie, looks at relationships and some of the typical prejudices she sees in society.\nThe filmmakers are well aware that “Srikandi” might draw criticism, but so far, the feedback has been positive.\nFriends who couldn’t afford to support the project with cash found other ways to help, donating time and energy to do things like assist with the lights or participate on the camera team, she added.\nWhile the film’s final cut has been completed, there is still post-production work to be done, primarily with sound and color. The group is still looking for funding to finance this final part of the project.\nThe group’s main motivation for completing “Srikandi” is to provide a platform for people to discuss a topic that is often deliberately overlooked.\nIt is undeniable that the film has already made an impact. In Germany it was selected as an official entry for the Panorama program at the Berlinale, Berlin’s international film festival, which will take place from Feb. 
9 to 19.\nIn addition to “Srikandi,” the Berlin festival will showcase two other feature films from Indonesia: “Postcards From the Zoo” by Edwin and “The Mirror Never Lies” by Kamila Andini, as well as the short film “7 Deadly Kisses” by Sammaria Simanjuntak.\nThe eight “Srikandi” women welcomed the news about the Berlinale with much excitement and joy.\n“We are obviously ecstatic about it,” Stea said, adding that it was also quite overwhelming.\nJakarta (ANTARA News) - United Arab Emirates (UAE) Parliament Speaker Mohammad Ahmad Almour has praised Indonesia as an exemplary OIC (Organization of the Islamic Conference) member country for its consistent adherence to democracy and tolerant Islamic culture.\nThe West had often said that Democracy and Islam could not go hand in hand and that the two could even come into conflict with each other, said Hidayat citing Almour`s statement at the Islamic parliamentary meeting in Palembang.\n\"Although the majority of its population is Muslim, Indonesia consistently implements democracy and it is therefore the pride of OIC,\" Hidayat quoted Almour as saying.\nHidayat also said the United Arab Emirates parliament hoped Indonesia could play a more significant role in straightening out the West`s negative view of Islam in relation with democracy.\nTherefore many misunderstanding views about Islam can be reduced and Islam is not identical with terrorism.\n\"Indonesia is famous for treating quests in friendly, courteous and respectful ways. 
The chairman of the Parliament of UAE really appreciates this,\" said Marzuki Alie, Speaker of Indonesia`s House of Representatives on the same occasion.\nArab countries want the stigma of Islamic terrorism can be eliminated through Indonesian good relations with Europe and America.\n\"They thought Indonesia could be an agent for having good relations and appreciated by the US and Europe,\" he said.\nIndonesia can show that democracy is not contrary to Islamic values and Islam is not terrorist, but the violence arises recently only committed by a small group of extremists that does not represent Islam.\nA 38-year-old Indonesian security guard has been charged with assault after kicking a 20-year-old woman in the face after mistaking her for a predatory ghost.\nThe guard was on duty in Bandung, Java, when \"suddenly a figure, who was sitting on the floor and clad in white with hair covering the face, moved slowly in a weird way toward the elevator\", according to the Jakarta Post.\nThe guard \"sensed a threat to himself and the others in the elevator\" and kicked the ghost to the floor \"so that everyone ... could run and save themselves,\" the security officer's lawyer told the Jakarta Post.\nBut guard Sunarya's seemingly noble deeds have landed him more punishment than accolades, as the \"ghost\" turned out to be the 20-year-old daughter of a coal businessman who was in the hotel playing a \"ghost trick\" for a friend's birthday. She was later hospitalised for a bruised face and broken tooth.\nSunarya now faces charges of assaulting the girl, who was dressed as suster ngesot, a \"crawling nurse ghost\".\nThe Indonesian archipelago, while predominantly Muslim, is strongly influenced by spiritual and supernatural myths from its Malay, Hindu and Buddhist heritage. 
Common ghost sightings often refer to floating female figures dressed in white, with long hair and, sometimes, long fingernails.\nGhost and vampire movies are wildly popular in Indonesia (some erotic, many others homemade), but supernatural action is not confined to the screen. In December, at least 10 people were reportedly possessed by spirits after visiting a mahogany tree in Jakarta said to house a pocong (shrouded corpse ghost) and kuntilak (female vampire ghost). The tree had to be cut down and the religious affairs minister called in to calm hysterical crowds, but the spirits were reported to have merely moved on to a neighbouring tree.\nAs supernatural creatures in Indonesia can often take on terrifying forms – such as the jenglot, a mummified vampiric doll with fangs and long hair who feeds off human and animal blood – it is not surprising that many Indonesians approach ghosts with some trepidation.\nAs one reader, following Sunarya's story, wrote to the Jakarta Post: \"Sunarya should receive a medal, or at least be showered with honours, for acting so courageously in facing what he initially thought was a real ghost.\"\nBut the security officer remains modest. \"I have a wife and two little children, I hope everything will be fine,\" he told reporters. 
\"I just did my job as a security guard.\"\nAmid a wealth of off-field missteps and political bickering, Stefano Lilipaly provided a bit of good news for Indonesian football on Sunday.\nThe 22-year-old midfielder scored in the 66th minute of his debut for FC Utrecht in the Dutch top flight, becoming the first Indonesian to find the net in the Eredivisie.\nUtrecht drew 1-1 with visiting PSV Eindhoven after Ola Toivonen equalized in the 71st minute.\nHe played for the Netherlands’ Under-15 and Under-18 national teams, but he looked to his roots when the Indonesian Football Association (PSSI) put out a call for talented youngsters of Indonesian heritage playing in Europe.\nStefano, whose father is Indonesian, was granted citizenship on Oct. 11, 2011, alongside fellow Dutch-born Jhonny Van Beukering and Tony Cussel and Nigeria-born Greg Nwokolo and Victor Igbonefo.\nHe was called up for Indonesia’s Under-23 tryouts to play in last year’s Southeast Asian Games, but he was cut by then-coach Rahmad Darmawan.\nUtrecht coach Jan Wouters praised the young midfielder for his development.\n“Lilipaly has developed very well, grown through the youth team and has shown his quality to us in this game,” he said.\nHowever, Aji Santoso, Indonesia’s new senior and U-23 head coach, said he did not want to rush in calling up Stefano for the national team’s next match – the final 2014 World Cup qualifier at Bahrain on Feb. 29.\nIndonesia is last in Group E with no wins and 16 goals conceded from five matches. Its last chance to reach the next round passed with a 4-1 home loss to Iran in November.\n“The Bahrain game will be our last match in the group and the results won’t decide anything. I prefer to call Stefano up for future events, such as the Suzuki Cup and the 2013 SEA Games,” Aji told the Jakarta Globe on Monday.\nStefano is one of a small but growing group of Indonesians playing in Europe. 
Yericho Christiantoko, Alfin Tuasalamony, Yandi Munawar and Syamsir Alam play in the Belgian second division with CS Vise, which is owned by the Bakrie family, while Arthur Irawan is trying his luck in Spain after signing with Espanyol’s U-19 team last year.\nIf those youngsters kept improving and played regularly, Aji said, they would help provide a brighter future for the national team. With world governing body FIFA prohibiting the PSSI from calling up players competing in the breakaway Indonesian Super League, the 41-year-old coach needs every option he can find.\n“I would love to see them playing in my team. I don’t think it will happen when we play against Bahrain, but it will be in the near future,” he said.\nDutch giants PSV.\nThey are holidays billed as an opportunity to enter another world, a chance to see the world's last primitive tribes up close in their natural environment.\nThe brochures tease and at times, critics say, titillate. Take the Delhi-based Aces Indian Tours, which invites visitors to travel to see the Bonda people, an ancient tribe found in the remote hilly regions of the state of Orissa. The website breathily offers to provide an insight into utterly different lives. \"On the northwest of river Machkund\", it states, \"live the wildest, rudest and possibly the most interesting tribe known as Bonda Tribe. The scanty dress of the Bonda women and homicidal tendency of Bonda males make them most fascinating people.\"\nIt is this kind of exotic invitation that has now come under unprecedented scrutiny in India, raising ethical issues that also apply to similar tours in other remote regions of the world.\nEvery year, thousands of western tourists visit India in search of the exotic. But two weeks after an Observer investigation exposed the degradation of \"human safaris\" in the Andaman Islands – which are in Indian territory – the country's travel industry has entered a bout of soul-searching. 
The Observer exposed video evidence that Jarawa tribeswomen had been bullied into dancing for convoys of visitors on the islands' main road. The reaction has been furious. Sonia Gandhi, leader of the ruling Congress party, has taken a personal interest in the 400-strong Jarawa's fate and is understood to be frustrated by the lack of action to protect them. At a meeting last week of the powerful National Advisory Council, which she chairs, members denounced the \"deplorable situation of the exploitation of the Jarawa tribe\".\nThe investigation has now prompted the home minister, P Chidambaram, to demand the interrogation of those responsible. He has flown to the Andamans to tell officials there to act swiftly to prevent further abuse of the tribe. Tribal affairs minister V Kishore Chandra Deo said: \"It's deplorable. You cannot treat human beings like beasts for the sake of money. Whatever kind of tourism is that? I totally disapprove and it is being banned.\"\nBut what about elsewhere in India, and in other parts of the world? In the case of the Jarawa, there is little doubt that the tribespeople have been exploited by unscrupulous locals and insensitive visitors. Elsewhere, ethical lines are usually more blurred, but the risk of damaging contact with vulnerable communities is very real.\nIn 1989, India introduced the Scheduled Caste and Scheduled Tribe (Prevention of Atrocities) Act in an attempt to protect indigenous communities from \"indignities, humiliations and harassment\". But with the number of tourists across the world expected to hit a billion this year and previously inaccessible places beginning to open up for more and more people, how close should travellers be allowed to come to vulnerable communities?\nOne of the most popular destinations in India is Orissa, where tour operators promise sightings of the insular Bonda when the tribespeople leave their homes to go to a market in the village of Onkadeli. 
Entry into the Bonda's own villages is illegal: the tribe do not invite attention and many dislike being photographed, but the market offers a way to view them, and the tours continue. Even the state tourist board uses images of the tribe in its adverts.\nRoyal India Holidays, with offices in India and the US, offers a tribal tour of Orissa where it promises tourists can \"see the lifestyle of tattooed, heavily beaded, nearly naked tribal people, their day to day activity and their extremely primitive way of living\".\nThe company says it is recognised by India's ministry of tourism. Its brochure describes a trip to the Jeypore area of Orissa: \"After breakfast, an excursion to the nearby hills where most amazing and fierce Bondas Tribes (naked people) reside. They are also known as Remo meaning 'people'. The Bonda are generally semi-clothed, with the women characterised by the wearing of thick silver necklace bands. The tribe is one of the oldest and most primitive with their culture little changed in over a thousand years. The best way to view members of the tribe is by going to local markets held every Thursday.\"\nCompany owner Newton Singh told the Observer that most tours in Orissa focused on the tribes. He said he believed that the company was operating within the law but it relied on local agents in Orissa. He said he understood the objections and would review the tours. \"I don't want to do anything against the laws of humanity,\" he said.\nBhubaneswar-based Dove Tours offers a tour of Orissa in which, it explains, \"the area we visit is the home of the approximately 6,000 members of the fierce Bondas (naked people). They live in the remote hills and keep themselves isolated … They can only be seen when they come to trade at the local market, and we must time our visit to coincide with the weekly market day.\"\nGagan Sarangi, speaking for the company, said it operated within responsible guidelines and only dealt with tourists who would respect the tribe. 
He said foreign tourists were barred from the Bonda's area, but it was still possible to spot them in the markets. \"We are totally against any kind of unethical practices in the tribal area,\" he said.\nSurvival International, which campaigns on behalf of tribal peoples, is sceptical that such a high-minded approach is reflected by what takes place on the ground. The charity's director Stephen Corry said: \"We are now in the 21st century, not the 19th. Colonialism should be a thing of the past. Tribes are not cultural relics, nor should they be treated like animals in a zoo.\n\"They are not ancient or backward, but adapting like everybody around us to a changing world. This should entitle them to the same rights and freedoms as the very tourists who are taking their photographs.\n\"Promoting tours by using derogatory terms such as 'primitive', and advertising their 'nakedness', shows a clear lack of respect.\"\nCorry said tour operators had no right to promote tribal people as a tourist attraction. \"Forcing them to dance in return for sweets and biscuits, for the amusement of onlookers, is only possible where they're viewed as somehow less than fully human,\" he said. \"Sadly, the existence of human safaris in the Andamans is not isolated, but replicated in other areas of India. It is crucial tourists boycott such unethical 'attractions', so there is no fuel in the market to drive such tasteless practices.\"\nAssociation of British Travel Agents spokesman Sean Tipton said the travel industry recognised the necessity of regulating and monitoring interaction between tourists and tribes. \"Customers should always seek permission before taking photographs of indigenous people, or indeed any local people. This is a matter of courtesy and cross-cultural sensitivity.\"\nSue Ockwell, for the Association of Independent Tour Operators, was also keen to emphasise that its member companies were against the sort of \"tacky tourism\" seen in the Andamans. 
\"The aim of Aito is to ensure that host destinations and local people at tourism destinations benefit from tourism as well as those who take tourists,\" she said.\n\"Unfortunately, exploitation does still occur – from child prostitution to the type of practice described in the Andamans. It requires action by UK tour operators and associations such as Aito and Abta and it also requires action by governments in the destinations affected. It is only by working together that this sort of business can be stamped out.\"\nClearly, though, more needs to be done. Blogging about a visit to Onkadeli, one tourist noted that the attention of tourists was clearly unwelcome to some of the tribal people: \"There were a few tourists around (including myself) and truth be said, it all felt a little rude and intrusive! Some of the adivasis [indigenous tribes] were clearly uncomfortable with camera-wielding tourists, so I started to only take pictures with their permission. This would almost always result in my having to part with 10 rupees [about 13p]!\"\nBritish travel firm Audley Travel, winner of three Guardian-Observer travel awards for best small tour operator, offers tours to Orissa in which it promises sightings of the Bonda tribe, despite acknowledging that photographing them may be banned. The company said it firmly advocated responsible tourism and only took individuals or couples.\n\"They are accompanied by tour guides who are well briefed on the cultural sensitivity of the situation, particularly photography. Our clients themselves are intelligent, informed travellers whose last intention would be to 'gawp' at local people.\"\nMeanwhile, in the Andaman Islands, police say they have made some progress in identifying those responsible for filming the video publicised by the Observer, which is now believed to have been shot in September or October of 2008. 
\"The police team is questioning several tour operators and taxi drivers of the city in an effort to track down the source of the video,\" said S B Tyagi, superintendent of police. Officers have raided several shops in the town in an attempt to seize videos of the Jarawa that have been circulating among tour operators and which are sold to tourists.\nPolice announced on Thursday that a senior officer had been placed in charge of monitoring the road and said they had arrested two tour operators named in the Observer report.\nThe lieutenant governor of the Andaman and Nicobar Islands, Bhopinder Singh, has been ordered to take action to prevent further exploitation of the Jarawa. Tribal affairs minister Krishna Chandra Deo described the incident as \"disgraceful and a shame on all of us\" and said the tribe should be treated as humans, not aliens.\nAs the country digests the implications of the scandal, Corry said tourists considering visiting tribal areas needed to think very carefully about the long-term effects on tribal peoples, instead of the \"fleeting thrill of the experience or the glory of the story once back home\".\nA high-profile member of Indonesia’s large transgender community has registered to become a member of the National Commission on Human Rights (Komnas-Ham).\nYulianus Rettoblaut, known as Mami Yuli or Mother Yuli, said on Friday that she was seeking to become a commissioner to fight for the rights to Indonesia’s ‘waria’ community, an Indonesian word taken from wanitia (woman) and pria (man).\nSpeaking at Komnas-Ham in Central Jakarta, Yuli said she would also fight for the rights of gay and lesbian communities as well as other heavily marginalized groups.\nYuli, head of the Communication Forum of Indonesia Transgender, said that high on her agenda was to allow transgender people access to school for a formal education to increase their career prospects.\nYuli has a law degree from Universitas Islam At Tahiriyah.\nAnother key issue was to fight 
for the rights of elderly transgender people who were often homeless and left to fend for themselves.\n“We will teach them skills such as baking cakes and other things so they can live independently and be productive,” Yuli said.\nFellow transgender activist Merlyn Sopjan said many transgenders were forced to quit school because they were mocked.\nShe said that while transgender people were treated better in the big cities, discrimination was pervasive.\nAn Indonesian civil servant who posted “God does not exist” on his Facebook page has been taken into police custody for his own protection after he was badly beaten.\nThe man, identified as Alexander, 31, now faces the prospect of losing his job, or even being jailed, if he fails to repent and accept one of six official state religions.\nBlasphemy carries a maximum sentence of five years in jail.\nLocal media reported that when Alexander arrived at work at the Dharmasraya Development Planning Board (Bappeda) on Wednesday, a group of men, also understood to comprise government officials, attacked and beat him before police arrived and took him into protective custody.\nAlexander was quoted by Padangekspres.co.id as saying that he did not believe in God because of the amount of crime and disasters.\nAlexander said he was born a Muslim but ceased religious activities in 2008.\nAziz said police would wait for a recommendation from the West Sumatra Coordinating Agency to Supervise Religion and Beliefs (Bakorpakem) as well as the Ministry of Religious Affairs before deciding on further action against Alexander.\nGusrizal Gazahar, head of the West Sumatera chapter of the Council of Ulema (MUI), told local media that if he refused to repent, Alexander should lose his job.\n“I want him to be fired,” he said.\n"} 
+{"added":"2023-04-07T03:31:56.038327+00:00","created":"2020-06-05T04:29:51Z","id":"http://cempaka-tourist.blogspot.com/2017/07/aborigines-in-australia-longer-than.html","metadata":{"bucket":"head","cc_segment":"crawl-data/CC-MAIN-2020-24/segments/1590348492427.71/wet/CC-MAIN-20200605014501-20200605044501-00150.warc.wet.gz","date_download":"2020-06-05T04:29:51Z","digest":"sha1:RNY4ZPQD7ZKQDEVHCJZY57VCWFEHAHJF","language":"en","language_score":0.95,"length":4839,"line_ids":[166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,200,203,1125,1126,1127,1128,1129,1130],"nlines":41,"original_length":53732,"original_nlines":1259,"perplexity":305.5,"provenance":"cc_en_head-0202.json.gz:28","source_domain":"cempaka-tourist.blogspot.com","title":"Cempaka Culture and Tourism: Aborigines in Australia longer than previously thought: study","url":"http://cempaka-tourist.blogspot.com/2017/07/aborigines-in-australia-longer-than.html"},"source":"common-crawl","text":"Aborigines in Australia longer than previously tho...\nSurf's up! N. Korea tourism agency tries to woo fo...\nGrand Imam of Istiqlal Mosque Calls for Preservati...\nWorld-Class Tourist Attractions and Nostalgia Brin...\nObama calls for tolerance and unity in childhood h..."}