diff --git a/packages/blake3-wasm/.npmignore b/packages/blake3-wasm/.npmignore new file mode 100644 index 0000000000..5657f6ea7d --- /dev/null +++ b/packages/blake3-wasm/.npmignore @@ -0,0 +1 @@ +vendor \ No newline at end of file diff --git a/packages/blake3-wasm/LICENSE_A2 b/packages/blake3-wasm/LICENSE_A2 new file mode 100644 index 0000000000..2cdf43fa3e --- /dev/null +++ b/packages/blake3-wasm/LICENSE_A2 @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/packages/blake3-wasm/LICENSE_A2LLVM b/packages/blake3-wasm/LICENSE_A2LLVM new file mode 100644 index 0000000000..2cdf43fa3e --- /dev/null +++ b/packages/blake3-wasm/LICENSE_A2LLVM @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/packages/blake3-wasm/LICENSE_CC0 b/packages/blake3-wasm/LICENSE_CC0 new file mode 100644 index 0000000000..1625c17936 --- /dev/null +++ b/packages/blake3-wasm/LICENSE_CC0 @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). 
+ +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. 
database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. 
Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. \ No newline at end of file diff --git a/packages/blake3-wasm/README.md b/packages/blake3-wasm/README.md new file mode 100644 index 0000000000..944cca112e --- /dev/null +++ b/packages/blake3-wasm/README.md @@ -0,0 +1,25 @@ +JS and WASM implementations of https://github.com/BLAKE3-team/BLAKE3 + +Using [AssemblyScript](https://www.assemblyscript.org/) to generate a lean WASM. + +## Usage + +```javascript +import { blake3, blake3Hex, createHasher, update, finalize } from '@huggingface/blake3-wasm'; + +// Create a Uint8Array of data to hash +const data = new Uint8Array(1_000_000); // Example: 1MB of data +// ... fill data with your content ... 
+ +const hashUint8 = blake3(data); +const hashHex = blake3Hex(data); + +// Or streaming fashion +const hasher = createHasher(); + +for (const chunk of dataSource) { + hasher.update(chunk); +} + +const hash = hasher.finalize(); +``` \ No newline at end of file diff --git a/packages/blake3-wasm/asconfig.json b/packages/blake3-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/blake3-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/blake3-wasm/assembly/blake3.ts b/packages/blake3-wasm/assembly/blake3.ts new file mode 100644 index 0000000000..e3c346b11a --- /dev/null +++ b/packages/blake3-wasm/assembly/blake3.ts @@ -0,0 +1,387 @@ +// Constants from the reference implementation +const OUT_LEN: i32 = 32; +// const KEY_LEN: usize = 32; +const BLOCK_LEN: i32 = 64; +const CHUNK_LEN: i32 = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +//const KEYED_HASH: u32 = 1 << 4; +//const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +// const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: StaticArray = [ + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +]; + +const MSG_PERMUTATION: StaticArray = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. 
+function g(state: StaticArray, a: i32, b: i32, c: i32, d: i32, mx: u32, my: u32): void { + state[a] = state[a] + state[b] + mx; + state[d] = rotr(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + my; + state[d] = rotr(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr(state[b] ^ state[c], 7); +} + +function round(state: StaticArray, m: StaticArray): void { + // Mix the columns. + g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. + g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +function permute(m: StaticArray): void { + const permuted = new StaticArray(16); + for (let i = 0; i < 16; i++) { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + for (let i = 0; i < 16; i++) { + m[i] = permuted[i]; + } +} + +function compress( + chaining_value: StaticArray, + block_words: StaticArray, + counter: u64, + block_len: u32, + flags: u32 +): StaticArray { + const counter_low = counter as u32; + const counter_high = (counter >> 32) as u32; + const state = new StaticArray(16); + + // Initialize state + for (let i = 0; i < 8; i++) { + state[i] = chaining_value[i]; + state[i + 8] = IV[i]; + } + state[12] = counter_low; + state[13] = counter_high; + state[14] = block_len; + state[15] = flags; + + const block = new StaticArray(16); + for (let i = 0; i < 16; i++) { + block[i] = block_words[i]; + } + + // Apply rounds + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + permute(block); + round(state, block); + + // Final mixing + for (let i = 0; i < 8; i++) { + state[i] ^= 
state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + + return state; +} + +function words_from_little_endian_bytes(bytes: Uint8Array, words: StaticArray): void { + for (let i = 0; i < words.length; i++) { + const offset = i * 4; + words[i] = + bytes[offset] | + ((bytes[offset + 1] as u32) << 8) | + ((bytes[offset + 2] as u32) << 16) | + ((bytes[offset + 3] as u32) << 24); + } +} + +class Blake3Hasher { + private chunk_state: ChunkState; + private key_words: StaticArray; + private cv_stack: StaticArray>; + private cv_stack_len: u8; + private flags: u32; + + constructor() { + const key_words = new StaticArray(8); + for (let i = 0; i < 8; i++) { + key_words[i] = IV[i]; + } + this.key_words = key_words; + this.chunk_state = new ChunkState(key_words, 0, 0); + this.cv_stack = new StaticArray>(54); + this.cv_stack_len = 0; + this.flags = 0; + + for (let i = 0; i < 54; i++) { + this.cv_stack[i] = new StaticArray(8); + } + } + + update(input: Uint8Array): void { + let inputPos = 0; + while (inputPos < input.length) { + if (this.chunk_state.len() == CHUNK_LEN) { + const chunk_cv = this.chunk_state.output().chaining_value(); + const total_chunks = this.chunk_state.chunk_counter + 1; + this.add_chunk_chaining_value(chunk_cv, total_chunks); + this.chunk_state = new ChunkState(this.key_words, total_chunks, this.flags); + } + + const want = CHUNK_LEN - this.chunk_state.len(); + const take = min(want, input.length - inputPos); + this.chunk_state.update(input.subarray(inputPos, inputPos + take)); + inputPos += take; + } + } + + finalize(out: Uint8Array): void { + let output = this.chunk_state.output(); + let parent_nodes_remaining = this.cv_stack_len; + + while (parent_nodes_remaining > 0) { + parent_nodes_remaining--; + output = parent_output( + this.cv_stack[parent_nodes_remaining], + output.chaining_value(), + this.key_words, + this.flags + ); + } + + output.root_output_bytes(out); + } + + private add_chunk_chaining_value(new_cv: StaticArray, total_chunks: u64): void { + let 
mut_new_cv = new_cv; + let mut_total_chunks = total_chunks; + + while ((mut_total_chunks & 1) == 0) { + mut_new_cv = parent_cv(this.pop_stack(), mut_new_cv, this.key_words, this.flags); + mut_total_chunks >>= 1; + } + + this.push_stack(mut_new_cv); + } + + private push_stack(cv: StaticArray): void { + for (let i = 0; i < 8; i++) { + this.cv_stack[this.cv_stack_len][i] = cv[i]; + } + this.cv_stack_len++; + } + + private pop_stack(): StaticArray { + this.cv_stack_len--; + return this.cv_stack[this.cv_stack_len]; + } +} + +class ChunkState { + chaining_value: StaticArray; + chunk_counter: u64; + block: Uint8Array; + block_len: u8; + blocks_compressed: u8; + flags: u32; + + constructor(key_words: StaticArray, chunk_counter: u64, flags: u32) { + this.chaining_value = new StaticArray(8); + this.chunk_counter = chunk_counter; + this.block = new Uint8Array(BLOCK_LEN); + this.block_len = 0; + this.blocks_compressed = 0; + this.flags = flags; + + for (let i = 0; i < 8; i++) { + this.chaining_value[i] = key_words[i]; + } + } + + len(): i32 { + return BLOCK_LEN * this.blocks_compressed + this.block_len; + } + + start_flag(): u32 { + return this.blocks_compressed == 0 ? 
CHUNK_START : 0; + } + + update(input: Uint8Array): void { + let inputPos = 0; + while (inputPos < input.length) { + if (this.block_len == BLOCK_LEN) { + const block_words = new StaticArray(16); + words_from_little_endian_bytes(this.block, block_words); + const compressed = compress( + this.chaining_value, + block_words, + this.chunk_counter, + BLOCK_LEN, + this.flags | this.start_flag() + ); + for (let i = 0; i < 8; i++) { + this.chaining_value[i] = compressed[i]; + } + this.blocks_compressed++; + this.block = new Uint8Array(BLOCK_LEN); + this.block_len = 0; + } + + const want = BLOCK_LEN - this.block_len; + const take = min(want, input.length - inputPos); + for (let i = 0; i < take; i++) { + this.block[this.block_len + i] = input[inputPos + i]; + } + this.block_len += take as u8; + inputPos += take; + } + } + + output(): Output { + const block_words = new StaticArray(16); + words_from_little_endian_bytes(this.block, block_words); + return new Output( + this.chaining_value, + block_words, + this.chunk_counter, + this.block_len, + this.flags | this.start_flag() | CHUNK_END + ); + } +} + +class Output { + input_chaining_value: StaticArray; + block_words: StaticArray; + counter: u64; + block_len: u32; + flags: u32; + + constructor( + input_chaining_value: StaticArray, + block_words: StaticArray, + counter: u64, + block_len: u32, + flags: u32 + ) { + this.input_chaining_value = input_chaining_value; + this.block_words = block_words; + this.counter = counter; + this.block_len = block_len; + this.flags = flags; + } + + chaining_value(): StaticArray { + const compressed = compress(this.input_chaining_value, this.block_words, this.counter, this.block_len, this.flags); + const result = new StaticArray(8); + for (let i = 0; i < 8; i++) { + result[i] = compressed[i]; + } + return result; + } + + root_output_bytes(out: Uint8Array): void { + let output_block_counter: u64 = 0; + for (let i = 0; i < out.length; i += 2 * OUT_LEN) { + const words = compress( + 
this.input_chaining_value, + this.block_words, + output_block_counter, + this.block_len, + this.flags | ROOT + ); + const out_block = out.subarray(i, i + 2 * OUT_LEN); + for (let j = 0; j < words.length; j++) { + const word = words[j]; + const offset = j * 4; + if (offset < out_block.length) { + out_block[offset] = word & 0xff; + if (offset + 1 < out_block.length) { + out_block[offset + 1] = (word >> 8) & 0xff; + if (offset + 2 < out_block.length) { + out_block[offset + 2] = (word >> 16) & 0xff; + if (offset + 3 < out_block.length) { + out_block[offset + 3] = (word >> 24) & 0xff; + } + } + } + } + } + output_block_counter++; + } + } +} + +function parent_output( + left_child_cv: StaticArray, + right_child_cv: StaticArray, + key_words: StaticArray, + flags: u32 +): Output { + const block_words = new StaticArray(16); + for (let i = 0; i < 8; i++) { + block_words[i] = left_child_cv[i]; + block_words[i + 8] = right_child_cv[i]; + } + return new Output(key_words, block_words, 0, BLOCK_LEN, PARENT | flags); +} + +function parent_cv( + left_child_cv: StaticArray, + right_child_cv: StaticArray, + key_words: StaticArray, + flags: u32 +): StaticArray { + return parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value(); +} + +export function blake3(input: Uint8Array): Uint8Array { + const hasher = new Blake3Hasher(); + hasher.update(input); + const output = new Uint8Array(32); + hasher.finalize(output); + return output; +} + +export function blake3Hex(input: Uint8Array): string { + const hash = blake3(input); + const hex = new Array(64); + for (let i = 0; i < 32; i++) { + hex[i * 2] = (hash[i] >> 4).toString(16); + hex[i * 2 + 1] = (hash[i] & 0x0f).toString(16); + } + return hex.join(""); +} + +export function createHasher(): Blake3Hasher { + return new Blake3Hasher(); +} + +export function update(hasher: Blake3Hasher, input: Uint8Array): void { + hasher.update(input); +} + +export function finalize(hasher: Blake3Hasher): Uint8Array { + const output = 
new Uint8Array(32); + hasher.finalize(output); + return output; +} diff --git a/packages/blake3-wasm/assembly/index.ts b/packages/blake3-wasm/assembly/index.ts new file mode 100644 index 0000000000..8183303929 --- /dev/null +++ b/packages/blake3-wasm/assembly/index.ts @@ -0,0 +1,2 @@ +// Re-export everything from blake3.ts +export * from "./blake3"; diff --git a/packages/blake3-wasm/assembly/tsconfig.json b/packages/blake3-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..8131d68a0a --- /dev/null +++ b/packages/blake3-wasm/assembly/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.36/node_modules/assemblyscript/std/assembly.json", + "include": ["./**/*.ts"] +} diff --git a/packages/blake3-wasm/build/.gitignore b/packages/blake3-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/blake3-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/blake3-wasm/package.json b/packages/blake3-wasm/package.json new file mode 100644 index 0000000000..f7350c6941 --- /dev/null +++ b/packages/blake3-wasm/package.json @@ -0,0 +1,33 @@ +{ + "name": "@huggingface/blake3-wasm", + "version": "0.0.1", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "node tests", + "prepare": "pnpm run build" + }, + "keywords": [ + "blake3", + "assemblyscript", + "assembly", + "wasm" + ], + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "devDependencies": { + "assemblyscript": "0.27.36" + } +} diff --git a/packages/blake3-wasm/pnpm-lock.yaml b/packages/blake3-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..9d7ac0a92a --- /dev/null +++ 
b/packages/blake3-wasm/pnpm-lock.yaml @@ -0,0 +1,38 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + assemblyscript: + specifier: 0.27.36 + version: 0.27.36 + +packages: + + assemblyscript@0.27.36: + resolution: {integrity: sha512-1qX2zf6p7l/mNYv8r21jC/Yft7kX7XKR3xUHw41zvV4xad5lyC8w7jZiwZBGoy64VKZLc+bTDJDWi8Kb70YrHA==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.36: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/blake3-wasm/tests/index.js b/packages/blake3-wasm/tests/index.js new file mode 100644 index 0000000000..55463e9495 --- /dev/null +++ b/packages/blake3-wasm/tests/index.js @@ -0,0 +1,163 @@ +// Adapted from https://github.com/mcmilk/BLAKE3-tests/blob/11a8abeceac93b5eba664eae3679efb4ffa5bc0a/blake3_test.c + +import { blake3Hex } from "../build/debug.js"; + +const buffer = new Uint8Array(102400); +let i = 0; +let j = 0; + +for (i = 0, j = 0; i < buffer.length; i++, j++) { + if (j === 251) { + j = 0; + } + buffer[i] = j; +} + +const testCases = [ + { + buf: buffer.slice(0, 0), + expected: "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262", + }, + { + buf: buffer.slice(0, 1), + expected: "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213", + }, + { + buf: buffer.slice(0, 2), + expected: "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63", + }, + { + buf: buffer.slice(0, 3), + expected: "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f", + }, + { + buf: 
buffer.slice(0, 4), + expected: "f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f32", + }, + { + buf: buffer.slice(0, 5), + expected: "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2", + }, + { + buf: buffer.slice(0, 6), + expected: "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c844", + }, + { + buf: buffer.slice(0, 7), + expected: "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe", + }, + { + buf: buffer.slice(0, 8), + expected: "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb", + }, + { + buf: buffer.slice(0, 63), + expected: "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b", + }, + { + buf: buffer.slice(0, 64), + expected: "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98", + }, + { + buf: buffer.slice(0, 65), + expected: "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee", + }, + { + buf: buffer.slice(0, 127), + expected: "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640d", + }, + { + buf: buffer.slice(0, 128), + expected: "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45ef", + }, + { + buf: buffer.slice(0, 129), + expected: "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12", + }, + { + buf: buffer.slice(0, 1023), + expected: "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11", + }, + { + buf: buffer.slice(0, 1024), + expected: "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af7", + }, + { + buf: buffer.slice(0, 1025), + expected: "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444", + }, + { + buf: buffer.slice(0, 2048), + expected: "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a", + }, + { + buf: buffer.slice(0, 2049), + expected: "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b6879522563030", + }, + { + buf: buffer.slice(0, 3072), + expected: "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd2", + }, + { + buf: 
buffer.slice(0, 3073), + expected: "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd3", + }, + { + buf: buffer.slice(0, 4096), + expected: "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e969", + }, + { + buf: buffer.slice(0, 4097), + expected: "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb995", + }, + { + buf: buffer.slice(0, 5120), + expected: "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833", + }, + { + buf: buffer.slice(0, 5121), + expected: "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff", + }, + { + buf: buffer.slice(0, 6144), + expected: "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca205", + }, + { + buf: buffer.slice(0, 6145), + expected: "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f", + }, + { + buf: buffer.slice(0, 7168), + expected: "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a", + }, + { + buf: buffer.slice(0, 7169), + expected: "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e7817", + }, + { + buf: buffer.slice(0, 8192), + expected: "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a63", + }, + { + buf: buffer.slice(0, 8193), + expected: "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3b", + }, + { + buf: buffer.slice(0, 102400), + expected: "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085", + }, +]; + +for (const testCase of testCases) { + const result = blake3Hex(testCase.buf); + console.log(result); + + if (result !== testCase.expected) { + console.error(`Test case failed: ${testCase.buf.length} bytes`); + console.error(`Expected: ${testCase.expected}`); + console.error(`Actual: ${result}`); + process.exit(1); + } +} + +console.log("All test cases passed"); diff --git a/packages/blake3-wasm/vendor/Cargo.lock b/packages/blake3-wasm/vendor/Cargo.lock new file mode 100644 index 0000000000..9f0162bf75 --- /dev/null +++ b/packages/blake3-wasm/vendor/Cargo.lock @@ -0,0 
+1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "blake3-example" +version = "0.1.0" diff --git a/packages/blake3-wasm/vendor/Cargo.toml b/packages/blake3-wasm/vendor/Cargo.toml new file mode 100644 index 0000000000..7f31968ed3 --- /dev/null +++ b/packages/blake3-wasm/vendor/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "blake3-example" +version = "0.1.0" +edition = "2021" + +[lib] +name = "reference_impl" +path = "src/lib.rs" + +[[bin]] +name = "blake3-example" +path = "src/main.rs" \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/README.md b/packages/blake3-wasm/vendor/README.md new file mode 100644 index 0000000000..46cce0d076 --- /dev/null +++ b/packages/blake3-wasm/vendor/README.md @@ -0,0 +1,27 @@ +# BLAKE3 Example + +This is a simple example that demonstrates using the BLAKE3 hash function with empty input. + +## Prerequisites + +- Rust and Cargo installed on your system. You can install them from [rustup.rs](https://rustup.rs/) + +## Running the Example + +1. Open a terminal in this directory +2. Run the following command: + ```bash + cargo run + ``` + +The program will output a 32-byte hash in hexadecimal format. For empty input, the expected output should be: +``` +af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 +``` + +## What the Code Does + +1. Creates a new BLAKE3 hasher +2. Updates it with empty input +3. Finalizes the hash into a 32-byte buffer +4. Prints the hash in hexadecimal format \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/src/blake3.rs b/packages/blake3-wasm/vendor/src/blake3.rs new file mode 100644 index 0000000000..bc701784f8 --- /dev/null +++ b/packages/blake3-wasm/vendor/src/blake3.rs @@ -0,0 +1,376 @@ +// From https://github.com/BLAKE3-team/BLAKE3/blob/master/reference_impl/reference_impl.rs + +//! This is the reference implementation of BLAKE3. It is used for testing and +//! 
as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 +//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) +//! discusses this implementation. You can render docs for this implementation +//! by running `cargo doc --open` in this directory. +//! +//! # Example +//! +//! ``` +//! let mut hasher = reference_impl::Hasher::new(); +//! hasher.update(b"abc"); +//! hasher.update(b"def"); +//! let mut hash = [0; 32]; +//! hasher.finalize(&mut hash); +//! let mut extended_hash = [0; 500]; +//! hasher.finalize(&mut extended_hash); +//! assert_eq!(hash, extended_hash[..32]); +//! ``` + +use core::cmp::min; + +const OUT_LEN: usize = 32; +const KEY_LEN: usize = 32; +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +const KEYED_HASH: u32 = 1 << 4; +const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: [u32; 8] = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. +fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(my); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +fn round(state: &mut [u32; 16], m: &[u32; 16]) { + // Mix the columns. 
+ g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. + g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +fn permute(m: &mut [u32; 16]) { + let mut permuted = [0; 16]; + for i in 0..16 { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + *m = permuted; +} + +fn compress( + chaining_value: &[u32; 8], + block_words: &[u32; 16], + counter: u64, + block_len: u32, + flags: u32, +) -> [u32; 16] { + let counter_low = counter as u32; + let counter_high = (counter >> 32) as u32; + #[rustfmt::skip] + let mut state = [ + chaining_value[0], chaining_value[1], chaining_value[2], chaining_value[3], + chaining_value[4], chaining_value[5], chaining_value[6], chaining_value[7], + IV[0], IV[1], IV[2], IV[3], + counter_low, counter_high, block_len, flags, + ]; + let mut block = *block_words; + + round(&mut state, &block); // round 1 + permute(&mut block); + round(&mut state, &block); // round 2 + permute(&mut block); + round(&mut state, &block); // round 3 + permute(&mut block); + round(&mut state, &block); // round 4 + permute(&mut block); + round(&mut state, &block); // round 5 + permute(&mut block); + round(&mut state, &block); // round 6 + permute(&mut block); + round(&mut state, &block); // round 7 + + for i in 0..8 { + state[i] ^= state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + state +} + +fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] { + compression_output[0..8].try_into().unwrap() +} + +fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) { + debug_assert_eq!(bytes.len(), 4 * words.len()); + for (four_bytes, word) in bytes.chunks_exact(4).zip(words) { + *word = u32::from_le_bytes(four_bytes.try_into().unwrap()); + } +} + +// Each chunk or parent node can produce either an 8-word chaining value or, by +// setting the 
ROOT flag, any number of final output bytes. The Output struct +// captures the state just prior to choosing between those two possibilities. +struct Output { + input_chaining_value: [u32; 8], + block_words: [u32; 16], + counter: u64, + block_len: u32, + flags: u32, +} + +impl Output { + fn chaining_value(&self) -> [u32; 8] { + first_8_words(compress( + &self.input_chaining_value, + &self.block_words, + self.counter, + self.block_len, + self.flags, + )) + } + + fn root_output_bytes(&self, out_slice: &mut [u8]) { + let mut output_block_counter = 0; + for out_block in out_slice.chunks_mut(2 * OUT_LEN) { + let words = compress( + &self.input_chaining_value, + &self.block_words, + output_block_counter, + self.block_len, + self.flags | ROOT, + ); + // The output length might not be a multiple of 4. + for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) { + out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]); + } + output_block_counter += 1; + } + } +} + +struct ChunkState { + chaining_value: [u32; 8], + chunk_counter: u64, + block: [u8; BLOCK_LEN], + block_len: u8, + blocks_compressed: u8, + flags: u32, +} + +impl ChunkState { + fn new(key_words: [u32; 8], chunk_counter: u64, flags: u32) -> Self { + Self { + chaining_value: key_words, + chunk_counter, + block: [0; BLOCK_LEN], + block_len: 0, + blocks_compressed: 0, + flags, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize + } + + fn start_flag(&self) -> u32 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the block buffer is full, compress it and clear it. More + // input is coming, so this compression is not CHUNK_END. 
+ if self.block_len as usize == BLOCK_LEN { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + self.chaining_value = first_8_words(compress( + &self.chaining_value, + &block_words, + self.chunk_counter, + BLOCK_LEN as u32, + self.flags | self.start_flag(), + )); + self.blocks_compressed += 1; + self.block = [0; BLOCK_LEN]; + self.block_len = 0; + } + + // Copy input bytes into the block buffer. + let want = BLOCK_LEN - self.block_len as usize; + let take = min(want, input.len()); + self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]); + self.block_len += take as u8; + input = &input[take..]; + } + } + + fn output(&self) -> Output { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + Output { + input_chaining_value: self.chaining_value, + block_words, + counter: self.chunk_counter, + block_len: self.block_len as u32, + flags: self.flags | self.start_flag() | CHUNK_END, + } + } +} + +fn parent_output( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> Output { + let mut block_words = [0; 16]; + block_words[..8].copy_from_slice(&left_child_cv); + block_words[8..].copy_from_slice(&right_child_cv); + Output { + input_chaining_value: key_words, + block_words, + counter: 0, // Always 0 for parent nodes. + block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes. + flags: PARENT | flags, + } +} + +fn parent_cv( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> [u32; 8] { + parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value() +} + +/// An incremental hasher that can accept any number of writes. 
+pub struct Hasher { + chunk_state: ChunkState, + key_words: [u32; 8], + cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values: + cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64 + flags: u32, +} + +impl Hasher { + fn new_internal(key_words: [u32; 8], flags: u32) -> Self { + Self { + chunk_state: ChunkState::new(key_words, 0, flags), + key_words, + cv_stack: [[0; 8]; 54], + cv_stack_len: 0, + flags, + } + } + + /// Construct a new `Hasher` for the regular hash function. + pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. + pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let mut key_words = [0; 8]; + words_from_little_endian_bytes(key, &mut key_words); + Self::new_internal(key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. The context + /// string should be hardcoded, globally unique, and application-specific. + pub fn new_derive_key(context: &str) -> Self { + let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); + context_hasher.update(context.as_bytes()); + let mut context_key = [0; KEY_LEN]; + context_hasher.finalize(&mut context_key); + let mut context_key_words = [0; 8]; + words_from_little_endian_bytes(&context_key, &mut context_key_words); + Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) + } + + fn push_stack(&mut self, cv: [u32; 8]) { + self.cv_stack[self.cv_stack_len as usize] = cv; + self.cv_stack_len += 1; + } + + fn pop_stack(&mut self) -> [u32; 8] { + self.cv_stack_len -= 1; + self.cv_stack[self.cv_stack_len as usize] + } + + // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. + fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { + // This chunk might complete some subtrees. For each completed subtree, + // its left child will be the current top entry in the CV stack, and + // its right child will be the current value of `new_cv`. 
Pop each left + // child off the stack, merge it with `new_cv`, and overwrite `new_cv` + // with the result. After all these merges, push the final value of + // `new_cv` onto the stack. The number of completed subtrees is given + // by the number of trailing 0-bits in the new total number of chunks. + while total_chunks & 1 == 0 { + new_cv = parent_cv(self.pop_stack(), new_cv, self.key_words, self.flags); + total_chunks >>= 1; + } + self.push_stack(new_cv); + } + + /// Add input to the hash state. This can be called any number of times. + pub fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the current chunk is complete, finalize it and reset the + // chunk state. More input is coming, so this chunk is not ROOT. + if self.chunk_state.len() == CHUNK_LEN { + let chunk_cv = self.chunk_state.output().chaining_value(); + let total_chunks = self.chunk_state.chunk_counter + 1; + self.add_chunk_chaining_value(chunk_cv, total_chunks); + self.chunk_state = ChunkState::new(self.key_words, total_chunks, self.flags); + } + + // Compress input bytes into the current chunk state. + let want = CHUNK_LEN - self.chunk_state.len(); + let take = min(want, input.len()); + self.chunk_state.update(&input[..take]); + input = &input[take..]; + } + } + + /// Finalize the hash and write any number of output bytes. + pub fn finalize(&self, out_slice: &mut [u8]) { + // Starting with the Output from the current chunk, compute all the + // parent chaining values along the right edge of the tree, until we + // have the root Output. 
+ let mut output = self.chunk_state.output(); + let mut parent_nodes_remaining = self.cv_stack_len as usize; + while parent_nodes_remaining > 0 { + parent_nodes_remaining -= 1; + output = parent_output( + self.cv_stack[parent_nodes_remaining], + output.chaining_value(), + self.key_words, + self.flags, + ); + } + output.root_output_bytes(out_slice); + } +} diff --git a/packages/blake3-wasm/vendor/src/lib.rs b/packages/blake3-wasm/vendor/src/lib.rs new file mode 100644 index 0000000000..874b108ebf --- /dev/null +++ b/packages/blake3-wasm/vendor/src/lib.rs @@ -0,0 +1,3 @@ +mod blake3; + +pub use blake3::*; \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/src/main.rs b/packages/blake3-wasm/vendor/src/main.rs new file mode 100644 index 0000000000..76a1537cbd --- /dev/null +++ b/packages/blake3-wasm/vendor/src/main.rs @@ -0,0 +1,30 @@ +use std::io::Write; + +fn main() { + println!("Starting BLAKE3 hash computation for empty input"); + + // Create a new hasher + let mut hasher = reference_impl::Hasher::new(); + println!("Created new hasher"); + + // Update with empty input + let input = &[0u8, 1u8]; + println!("Input length: {} bytes", input.len()); + hasher.update(input); + println!("Updated hasher with input"); + + // Create a buffer for the output + let mut output = [0u8; 32]; + + // Get the hash + hasher.finalize(&mut output); + println!("Finalized hash computation"); + + // Print the hash in hex format + let mut stdout = std::io::stdout(); + print!("Final hash: "); + for byte in output { + write!(stdout, "{:02x}", byte).unwrap(); + } + println!(); +} \ No newline at end of file diff --git a/packages/blake3-wasm/vendor/target/.gitignore b/packages/blake3-wasm/vendor/target/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/blake3-wasm/vendor/target/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/gearhash-wasm/README.md b/packages/gearhash-wasm/README.md new file mode 100644 index 
0000000000..2c0de81843 --- /dev/null +++ b/packages/gearhash-wasm/README.md @@ -0,0 +1,85 @@ +JS and WASM implementations of https://github.com/srijs/rust-gearhash + +Using [AssemblyScript](https://www.assemblyscript.org/) to generate a lean WASM. + +## Usage + +```javascript +import { nextMatch, nextMatches } from '@huggingface/gearhash-wasm'; + +// Create a Uint8Array of data to search through +const data = new Uint8Array(1000000); // Example: 1MB of data +// ... fill data with your content ... + +const mask = 0x0000d90003530000n; // Example mask as a BigInt, more 1s in binary repr => bigger chunks +//^ it has 11 1s in binary, so chunks will be ~2048 long +const match = nextMatch(data, mask); +const allMatches = nextMatches(data, mask).matches; +``` + +The `nextMatch` function takes up to three parameters: +- `data`: A Uint8Array containing the data to search through +- `mask`: A BigInt, the more 1s it has in its binary representation, the bigger the chunk +- `hash`: (optional) A BigInt, the hash state to resume from; defaults to `0n` + +The function returns an object with the `position` (i32) and `hash` (u64) properties. `position` is the number of bytes consumed up to and including the match, or `-1` when no match was found. + +You can continuously feed data like this: + +```javascript +let hash = 0n; +const mask = 0x0000d90003530000n; + +let length = 0; // extra length not processed +for await (const chunk of dataSource) { + let index = 0; + while (1) { + let match = nextMatch(chunk.subarray(index), mask, hash); + + if (match.position !== -1) { + console.log({ + length: match.position + length, + hash: match.hash + }) + + index += match.position; + length = 0; + hash = 0n; + } else { + length += chunk.length - index; + hash = match.hash; + break; + } + } +} + +console.log(length, "bytes without a match, ending hash: ", hash); +``` + +or, more performant with `nextMatches` (each `match.position` is relative to the end of the previous match): + +```javascript +let hash = 0n; +const mask = 0x0000d90003530000n; + +let length = 0; +for await (const chunk of dataSource) { + const result = nextMatches(chunk, mask, hash); + for (const match of result.matches) { + console.log({ + length: match.position + length, + hash:
match.hash + }); + + length = 0; + } + length += result.remaining; + hash = result.hash; +} + +console.log(length, "bytes without a match, ending hash: ", hash); +``` + +## Possible improvements + +SIMD \ No newline at end of file diff --git a/packages/gearhash-wasm/asconfig.json b/packages/gearhash-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/gearhash-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/gearhash-wasm/assembly/index.ts b/packages/gearhash-wasm/assembly/index.ts new file mode 100644 index 0000000000..447e7776f7 --- /dev/null +++ b/packages/gearhash-wasm/assembly/index.ts @@ -0,0 +1,2 @@ +export { DEFAULT_TABLE } from "./table"; +export { nextMatch, nextMatches } from "./next-match"; diff --git a/packages/gearhash-wasm/assembly/next-match.ts b/packages/gearhash-wasm/assembly/next-match.ts new file mode 100644 index 0000000000..1093f77a80 --- /dev/null +++ b/packages/gearhash-wasm/assembly/next-match.ts @@ -0,0 +1,46 @@ +// The entry file of your WebAssembly module.
+ +import { DEFAULT_TABLE } from "./table"; + +// Interface for the match result +export class MatchResult { + position: i32 = -1; + hash: u64 = 0; +} + +// Function to find the next match in the buffer +export function nextMatch(buf: Uint8Array, mask: u64, hash: u64 = 0): MatchResult { + for (let i = 0; i < buf.length; i++) { + const b = buf[i]; + hash = (hash << 1) + DEFAULT_TABLE[b]; + + if ((hash & mask) == 0) { + return { position: i + 1, hash }; + } + } + + return { position: -1, hash }; // Return -1 position to indicate no match found, along with the final hash +} + +export class NextMatchesResult { + matches: MatchResult[] = []; + hash: u64 = 0; + remaining: i32 = 0; +} + +export function nextMatches(buf: Uint8Array, mask: u64, hash: u64 = 0): NextMatchesResult { + const result = new NextMatchesResult(); + + let match = nextMatch(buf, mask, hash); + let position = 0; + while (match.position !== -1) { + result.matches.push(match); + position += match.position; + match = nextMatch(buf.subarray(position), mask, 0); + } + + result.remaining = buf.length - position; + result.hash = match.hash; + + return result; +} diff --git a/packages/gearhash-wasm/assembly/table.ts b/packages/gearhash-wasm/assembly/table.ts new file mode 100644 index 0000000000..22a9e52df9 --- /dev/null +++ b/packages/gearhash-wasm/assembly/table.ts @@ -0,0 +1,57 @@ +/* eslint-disable @typescript-eslint/no-loss-of-precision */ + +// Define the Table type as a static array of u64 values +export const DEFAULT_TABLE: StaticArray<u64> = [ + 0xb088d3a9e840f559, 0x5652c7f739ed20d6, 0x45b28969898972ab, 0x6b0a89d5b68ec777, 0x368f573e8b7a31b7, + 0x1dc636dce936d94b, 0x207a4c4e5554d5b6, 0xa474b34628239acb, 0x3b06a83e1ca3b912, 0x90e78d6c2f02baf7, + 0xe1c92df7150d9a8a, 0x8e95053a1086d3ad, 0x5a2ef4f1b83a0722, 0xa50fac949f807fae, 0x0e7303eb80d8d681, + 0x99b07edc1570ad0f, 0x689d2fb555fd3076, 0x00005082119ea468, 0xc4b08306a88fcc28, 0x3eb0678af6374afd, + 0xf19f87ab86ad7436, 0xf2129fbfbe6bc736, 0x481149575c98a4ed,
0x0000010695477bc5, 0x1fba37801a9ceacc, + 0x3bf06fd663a49b6d, 0x99687e9782e3874b, 0x79a10673aa50d8e3, 0xe4accf9e6211f420, 0x2520e71f87579071, + 0x2bd5d3fd781a8a9b, 0x00de4dcddd11c873, 0xeaa9311c5a87392f, 0xdb748eb617bc40ff, 0xaf579a8df620bf6f, + 0x86a6e5da1b09c2b1, 0xcc2fc30ac322a12e, 0x355e2afec1f74267, 0x2d99c8f4c021a47b, 0xbade4b4a9404cfc3, + 0xf7b518721d707d69, 0x3286b6587bf32c20, 0x0000b68886af270c, 0xa115d6e4db8a9079, 0x484f7e9c97b2e199, + 0xccca7bb75713e301, 0xbf2584a62bb0f160, 0xade7e813625dbcc8, 0x000070940d87955a, 0x8ae69108139e626f, + 0xbd776ad72fde38a2, 0xfb6b001fc2fcc0cf, 0xc7a474b8e67bc427, 0xbaf6f11610eb5d58, 0x09cb1f5b6de770d1, + 0xb0b219e6977d4c47, 0x00ccbc386ea7ad4a, 0xcc849d0adf973f01, 0x73a3ef7d016af770, 0xc807d2d386bdbdfe, + 0x7f2ac9966c791730, 0xd037a86bc6c504da, 0xf3f17c661eaa609d, 0xaca626b04daae687, 0x755a99374f4a5b07, + 0x90837ee65b2caede, 0x6ee8ad93fd560785, 0x0000d9e11053edd8, 0x9e063bb2d21cdbd7, 0x07ab77f12a01d2b2, + 0xec550255e6641b44, 0x78fb94a8449c14c6, 0xc7510e1bc6c0f5f5, 0x0000320b36e4cae3, 0x827c33262c8b1a2d, + 0x14675f0b48ea4144, 0x267bd3a6498deceb, 0xf1916ff982f5035e, 0x86221b7ff434fb88, 0x9dbecee7386f49d8, + 0xea58f8cac80f8f4a, 0x008d198692fc64d8, 0x6d38704fbabf9a36, 0xe032cb07d1e7be4c, 0x228d21f6ad450890, + 0x635cb1bfc02589a5, 0x4620a1739ca2ce71, 0xa7e7dfe3aae5fb58, 0x0c10ca932b3c0deb, 0x2727fee884afed7b, + 0xa2df1c6df9e2ab1f, 0x4dcdd1ac0774f523, 0x000070ffad33e24e, 0xa2ace87bc5977816, 0x9892275ab4286049, + 0xc2861181ddf18959, 0xbb9972a042483e19, 0xef70cd3766513078, 0x00000513abfc9864, 0xc058b61858c94083, + 0x09e850859725e0de, 0x9197fb3bf83e7d94, 0x7e1e626d12b64bce, 0x520c54507f7b57d1, 0xbee1797174e22416, + 0x6fd9ac3222e95587, 0x0023957c9adfbf3e, 0xa01c7d7e234bbe15, 0xaba2c758b8a38cbb, 0x0d1fa0ceec3e2b30, + 0x0bb6a58b7e60b991, 0x4333dd5b9fa26635, 0xc2fd3b7d4001c1a3, 0xfb41802454731127, 0x65a56185a50d18cb, + 0xf67a02bd8784b54f, 0x696f11dd67e65063, 0x00002022fca814ab, 0x8cd6be912db9d852, 0x695189b6e9ae8a57, + 0xee9453b50ada0c28, 
0xd8fc5ea91a78845e, 0xab86bf191a4aa767, 0x0000c6b5c86415e5, 0x267310178e08a22e, + 0xed2d101b078bca25, 0x3b41ed84b226a8fb, 0x13e622120f28dc06, 0xa315f5ebfb706d26, 0x8816c34e3301bace, + 0xe9395b9cbb71fdae, 0x002ce9202e721648, 0x4283db1d2bb3c91c, 0xd77d461ad2b1a6a5, 0xe2ec17e46eeb866b, + 0xb8e0be4039fbc47c, 0xdea160c4d5299d04, 0x7eec86c8d28c3634, 0x2119ad129f98a399, 0xa6ccf46b61a283ef, + 0x2c52cedef658c617, 0x2db4871169acdd83, 0x0000f0d6f39ecbe9, 0x3dd5d8c98d2f9489, 0x8a1872a22b01f584, + 0xf282a4c40e7b3cf2, 0x8020ec2ccb1ba196, 0x6693b6e09e59e313, 0x0000ce19cc7c83eb, 0x20cb5735f6479c3b, + 0x762ebf3759d75a5b, 0x207bfe823d693975, 0xd77dc112339cd9d5, 0x9ba7834284627d03, 0x217dc513e95f51e9, + 0xb27b1a29fc5e7816, 0x00d5cd9831bb662d, 0x71e39b806d75734c, 0x7e572af006fb1a23, 0xa2734f2f6ae91f85, + 0xbf82c6b5022cddf2, 0x5c3beac60761a0de, 0xcdc893bb47416998, 0x6d1085615c187e01, 0x77f8ae30ac277c5d, + 0x917c6b81122a2c91, 0x5b75b699add16967, 0x0000cf6ae79a069b, 0xf3c40afa60de1104, 0x2063127aa59167c3, + 0x621de62269d1894d, 0xd188ac1de62b4726, 0x107036e2154b673c, 0x0000b85f28553a1d, 0xf2ef4e4c18236f3d, + 0xd9d6de6611b9f602, 0xa1fc7955fb47911c, 0xeb85fd032f298dbd, 0xbe27502fb3befae1, 0xe3034251c4cd661e, + 0x441364d354071836, 0x0082b36c75f2983e, 0xb145910316fa66f0, 0x021c069c9847caf7, 0x2910dfc75a4b5221, + 0x735b353e1c57a8b5, 0xce44312ce98ed96c, 0xbc942e4506bdfa65, 0xf05086a71257941b, 0xfec3b215d351cead, + 0x00ae1055e0144202, 0xf54b40846f42e454, 0x00007fd9c8bcbcc8, 0xbfbd9ef317de9bfe, 0xa804302ff2854e12, + 0x39ce4957a5e5d8d4, 0xffb9e2a45637ba84, 0x55b9ad1d9ea0818b, 0x00008acbf319178a, 0x48e2bfc8d0fbfb38, + 0x8be39841e848b5e8, 0x0e2712160696a08b, 0xd51096e84b44242a, 0x1101ba176792e13a, 0xc22e770f4531689d, + 0x1689eff272bbc56c, 0x00a92a197f5650ec, 0xbc765990bda1784e, 0xc61441e392fcb8ae, 0x07e13a2ced31e4a0, + 0x92cbe984234e9d4d, 0x8f4ff572bb7d8ac5, 0x0b9670c00b963bd0, 0x62955a581a03eb01, 0x645f83e5ea000254, + 0x41fce516cd88f299, 0xbbda9748da7a98cf, 0x0000aab2fe4845fa, 0x19761b069bf56555, 
0x8b8f5e8343b6ad56, + 0x3e5d1cfd144821d9, 0xec5c1e2ca2b0cd8f, 0xfaf7e0fea7fbb57f, 0x000000d3ba12961b, 0xda3f90178401b18e, + 0x70ff906de33a5feb, 0x0527d5a7c06970e7, 0x22d8e773607c13e9, 0xc9ab70df643c3bac, 0xeda4c6dc8abe12e3, + 0xecef1f410033e78a, 0x0024c2b274ac72cb, 0x06740d954fa900b4, 0x1d7a299b323d6304, 0xb3c37cb298cbead5, + 0xc986e3c76178739b, 0x9fabea364b46f58a, 0x6da214c5af85cc56, 0x17a43ed8b7a38f84, 0x6eccec511d9adbeb, + 0xf9cab30913335afb, 0x4a5e60c5f415eed2, 0x00006967503672b4, 0x9da51d121454bb87, 0x84321e13b9bbc816, + 0xfb3d6fb6ab2fdd8d, 0x60305eed8e160a8d, 0xcbbf4b14e9946ce8, 0x00004f63381b10c3, 0x07d5b7816fcc4e10, + 0xe5a536726a6a8155, 0x57afb23447a07fdd, 0x18f346f7abc9d394, 0x636dc655d61ad33d, 0xcc8bab4939f7f3f6, + 0x63c7a906c1dd187b, +]; diff --git a/packages/gearhash-wasm/assembly/tsconfig.json b/packages/gearhash-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..f81c3d55e6 --- /dev/null +++ b/packages/gearhash-wasm/assembly/tsconfig.json @@ -0,0 +1,6 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.36/node_modules/assemblyscript/std/assembly.json", + "include": [ + "./**/*.ts" + ] +} \ No newline at end of file diff --git a/packages/gearhash-wasm/build/.gitignore b/packages/gearhash-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/gearhash-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/gearhash-wasm/package.json b/packages/gearhash-wasm/package.json new file mode 100644 index 0000000000..60d0ae0cd9 --- /dev/null +++ b/packages/gearhash-wasm/package.json @@ -0,0 +1,33 @@ +{ + "name": "@huggingface/gearhash-wasm", + "version": "0.0.1", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "node tests", + "prepare": "pnpm run build" + }, + "keywords": [ + "gearhash", + "assemblyscript", + 
"assembly", + "wasm" + ], + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "devDependencies": { + "assemblyscript": "0.27.36" + } +} diff --git a/packages/gearhash-wasm/pnpm-lock.yaml b/packages/gearhash-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..9d7ac0a92a --- /dev/null +++ b/packages/gearhash-wasm/pnpm-lock.yaml @@ -0,0 +1,38 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + devDependencies: + assemblyscript: + specifier: 0.27.36 + version: 0.27.36 + +packages: + + assemblyscript@0.27.36: + resolution: {integrity: sha512-1qX2zf6p7l/mNYv8r21jC/Yft7kX7XKR3xUHw41zvV4xad5lyC8w7jZiwZBGoy64VKZLc+bTDJDWi8Kb70YrHA==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.36: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/gearhash-wasm/tests/index.js b/packages/gearhash-wasm/tests/index.js new file mode 100644 index 0000000000..dfed8e01ae --- /dev/null +++ b/packages/gearhash-wasm/tests/index.js @@ -0,0 +1,252 @@ +import { nextMatch, nextMatches } from "../build/debug.js"; + +// Simple deterministic RNG for reproducible results (24-bit version) +class SimpleRng { + constructor(seed) { + this.state = seed & 0xffffff; // Keep only 24 bits + } + + nextU24() { + // Simple 24-bit linear congruential generator + // Using 24-bit arithmetic to avoid overflow + 
this.state = (this.state * 1111 + 12345) & 0xffffff; + return this.state; + } + + fillBytes(dest) { + for (let i = 0; i < dest.length; i += 3) { + const value = this.nextU24(); + for (let j = 0; j < 3 && i + j < dest.length; j++) { + dest[i + j] = (value >> (j * 8)) & 0xff; + } + } + } +} + +const BENCH_INPUT_SEED = 0xbecd17f; +const BENCH_MASK = 0x0000d90003530000n; +const INPUT_SIZE = 100_000; + +function generateTestInput() { + const bytes = new Uint8Array(INPUT_SIZE); + const rng = new SimpleRng(BENCH_INPUT_SEED); + rng.fillBytes(bytes); + return bytes; +} + +function testGearhash() { + console.log(`Generating test input with seed: 0x${BENCH_INPUT_SEED.toString(16)}`); + const inputBuf = generateTestInput(); + console.log(`Input size: ${inputBuf.length} bytes`); + console.log(`Mask: 0x${BENCH_MASK.toString(16)}`); + + let offset = 0; + let chunkCount = 0; + let totalProcessed = 0; + let hash = 0n; + + console.log("\nProcessing chunks:"); + console.log("Chunk | Offset | Size | Hash"); + console.log("------|--------|------|------------------"); + + const result = nextMatches(inputBuf, BENCH_MASK, 0); + const matches = [...result.matches, { position: result.remaining, hash: result.hash }]; + + for (const match of matches) { + totalProcessed += match.position; + chunkCount += 1; + hash = match.hash; + + console.log( + `${chunkCount.toString().padStart(5)} | ${offset.toString().padStart(6)} | ${match.position + .toString() + .padStart(4)} | 0x${match.hash.toString(16).padStart(16, "0")}` + ); + offset += match.position; + } + + console.log("\nSummary:"); + console.log(`Total chunks: ${chunkCount}`); + console.log(`Total bytes processed: ${totalProcessed}`); + console.log(`Average chunk size: ${(totalProcessed / chunkCount).toFixed(1)} bytes`); + + // Print first few bytes of each chunk for verification + console.log("\nFirst 16 bytes of each chunk:"); + offset = 0; + chunkCount = 0; + hash = 0n; + + while (offset < inputBuf.length) { + const result = 
nextMatch(inputBuf.subarray(offset), BENCH_MASK, hash); + if (result.matchSize > 0) { + const chunk = inputBuf.subarray(offset, offset + result.matchSize); + const hexBytes = Array.from(chunk.slice(0, Math.min(16, chunk.length))) + .map((b) => b.toString(16).padStart(2, "0")) + .join(""); + console.log(`Chunk ${chunkCount + 1}: ${hexBytes}`); + offset += result.matchSize; + chunkCount += 1; + hash = result.hash; + } else { + const chunk = inputBuf.subarray(offset); + const hexBytes = Array.from(chunk.slice(0, Math.min(16, chunk.length))) + .map((b) => b.toString(16).padStart(2, "0")) + .join(""); + console.log(`Chunk ${chunkCount + 1}: ${hexBytes} (final)`); + break; + } + } + + return { chunkCount, totalProcessed, averageChunkSize: totalProcessed / chunkCount }; +} + +// Parse the expected results from Rust +function parseExpectedResults(resultData) { + const lines = resultData.trim().split("\n"); + const results = []; + + for (const line of lines) { + const match = line.match(/\s*(\d+)\s*\|\s*(\d+)\s*\|\s*(\d+)\s*\|\s*(0x[a-f0-9]+)/); + if (match) { + results.push({ + chunk: parseInt(match[1]), + offset: parseInt(match[2]), + size: parseInt(match[3]), + hash: match[4], + }); + } + } + + return results; +} + +const resultData = `Chunk | Offset | Size | Hash +------|--------|------|------------------ + 1 | 0 | 3598 | 0x033220f080ac5f77 + 2 | 3598 | 3995 | 0xd06b22f324ac5f28 + 3 | 7593 | 4708 | 0xa3a324f81808429c + 4 | 12301 | 484 | 0x12a5006aa4a4425b + 5 | 12785 | 1484 | 0x0b240413a4a4d5a2 + 6 | 14269 | 563 | 0xc646022fbc848bc6 + 7 | 14832 | 6663 | 0x7c7a2296e4a4c325 + 8 | 21495 | 1220 | 0xbe1f2468f0841b68 + 9 | 22715 | 1175 | 0xf87e2299e00c57d9 + 10 | 23890 | 779 | 0x79ca2634d00cd6b9 + 11 | 24669 | 2069 | 0xcb7a063594081a74 + 12 | 26738 | 2623 | 0xdccc26b6c0acb733 + 13 | 29361 | 596 | 0x4fb6201a1c20143e + 14 | 29957 | 622 | 0x81e726272020706f + 15 | 30579 | 3834 | 0x630622fca084a60a + 16 | 34413 | 2379 | 0x177b2240080810b1 + 17 | 36792 | 3527 | 0x663b261bbc2451ed 
+ 18 | 40319 | 1665 | 0xf94f06db94003e2f + 19 | 41984 | 1240 | 0xc5ca208c0c24cefc + 20 | 43224 | 1274 | 0x8139244f740cba39 + 21 | 44498 | 3680 | 0x4440044520045a9d + 22 | 48178 | 1487 | 0xe00f2049a0a43a58 + 23 | 49665 | 4293 | 0x366a26940408279d + 24 | 53958 | 1184 | 0x3a582683902cb3fe + 25 | 55142 | 383 | 0x002d0499e080702e + 26 | 55525 | 1206 | 0x34ba041aa4084fbd + 27 | 56731 | 506 | 0x0c53045c00a0a228 + 28 | 57237 | 8019 | 0xf85b202d9c0813a5 + 29 | 65256 | 1070 | 0x1c862295ac8863ba + 30 | 66326 | 3359 | 0x4e4804d7b82805c7 + 31 | 69685 | 1744 | 0x75b7224cc8209457 + 32 | 71429 | 152 | 0xb01e26b40c0cf7c0 + 33 | 71581 | 11 | 0xc66002b7f48c0472 + 34 | 71592 | 1209 | 0x0a33021dc4007363 + 35 | 72801 | 1795 | 0xd0cc22ea708c921f + 36 | 74596 | 856 | 0x49e3007c9c2c5727 + 37 | 75452 | 97 | 0xe0b422e3c40c89dc + 38 | 75549 | 1299 | 0xbd1806074024536a + 39 | 76848 | 131 | 0xd61104147c28928d + 40 | 76979 | 1987 | 0x31930627a080ebb0 + 41 | 78966 | 11254 | 0x4c4400e65c24beff + 42 | 90220 | 868 | 0xa92400ca5ca02488 + 43 | 91088 | 6279 | 0x5a3d0443f0a0d81a + 44 | 97367 | 969 | 0x7770042d140c7472 + 45 | 98336 | 1664 | 0xe508202f55c46d2d`; + +console.log("ok"); + +// Run the test and capture output for comparison +console.log("\n" + "=".repeat(50)); +console.log("RUNNING GEARHASH TEST"); +console.log("=".repeat(50)); + +// Capture console output for comparison +const originalLog = console.log; +let capturedOutput = []; + +console.log = function (...args) { + capturedOutput.push(args.join(" ")); + originalLog.apply(console, args); +}; + +// Run the test +const testResults = testGearhash(); + +// Restore console.log +console.log = originalLog; + +// Extract the chunk data from captured output +const chunkLines = capturedOutput.filter((line) => line.match(/^\s*\d+\s*\|\s*\d+\s*\|\s*\d+\s*\|\s*0x[a-f0-9]+/)); + +// Format the captured results for comparison +const capturedResultData = chunkLines.join("\n"); + +console.log("\n" + "=".repeat(50)); +console.log("COMPARISON RESULTS"); 
+console.log("=".repeat(50)); + +// Compare with expected results +const expectedResults = parseExpectedResults(resultData); +const actualResults = parseExpectedResults(capturedResultData); + +let matches = 0; +let totalChunks = Math.min(actualResults.length, expectedResults.length); + +console.log(`Comparing ${totalChunks} chunks...`); + +for (let i = 0; i < totalChunks; i++) { + const actual = actualResults[i]; + const expected = expectedResults[i]; + + if (actual.offset === expected.offset && actual.size === expected.size && actual.hash === expected.hash) { + matches++; + } else { + console.log(`āŒ Mismatch at chunk ${i + 1}:`); + console.log(` Expected: offset=${expected.offset}, size=${expected.size}, hash=${expected.hash}`); + console.log(` Actual: offset=${actual.offset}, size=${actual.size}, hash=${actual.hash}`); + process.exitCode = 1; + } +} + +console.log(`\nāœ… Results: ${matches}/${totalChunks} chunks match exactly`); +console.log(`šŸ“Š Accuracy: ${((matches / totalChunks) * 100).toFixed(2)}%`); + +if (matches === totalChunks) { + console.log("šŸŽ‰ All chunks match! AssemblyScript implementation is correct."); +} else { + console.log("āš ļø Some chunks don't match. 
Check the implementation."); +} + +// Test summary +console.log("\n" + "=".repeat(50)); +console.log("TEST SUMMARY"); +console.log("=".repeat(50)); +console.log(`Total chunks processed: ${testResults.chunkCount}`); +console.log(`Total bytes processed: ${testResults.totalProcessed}`); +console.log(`Average chunk size: ${testResults.averageChunkSize.toFixed(1)} bytes`); +console.log(`Matching chunks: ${matches}/${totalChunks}`); +console.log(`Accuracy: ${((matches / totalChunks) * 100).toFixed(2)}%`); + +const input = generateTestInput().slice(0, 100); + +let output = ""; +for (let i = 0; i < input.length; i++) { + output += input[i].toString(16).padStart(2, "0") + " "; +} + +console.log("First 100 bytes", output); diff --git a/packages/gearhash-wasm/vendor/.gitignore b/packages/gearhash-wasm/vendor/.gitignore new file mode 100644 index 0000000000..293dd90a84 --- /dev/null +++ b/packages/gearhash-wasm/vendor/.gitignore @@ -0,0 +1,4 @@ +/target +**/*.rs.bk +Cargo.lock +.idea \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/Cargo.toml b/packages/gearhash-wasm/vendor/Cargo.toml new file mode 100644 index 0000000000..e425f8932a --- /dev/null +++ b/packages/gearhash-wasm/vendor/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "gearhash" +version = "0.1.3" +edition = "2018" +license = "MIT OR Apache-2.0" +authors = ["Sam Rijs "] +description = "Fast, SIMD-accelerated hash function for content-defined chunking" +repository = "https://github.com/srijs/rust-gearhash" +readme = "README.md" +keywords = ["hash", "gear", "fast", "cdc", "chunking"] + +[features] +bench = [] + +[dependencies] +cfg-if = "0.1.10" + +[dev-dependencies] +lazy_static = "1.4.0" +quickcheck = "0.9.0" +rand = "0.7.2" + +[[bin]] +name = "test_gearhash" +path = "test_gearhash.rs" \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/LICENSE-APACHE b/packages/gearhash-wasm/vendor/LICENSE-APACHE new file mode 100644 index 0000000000..2cdf43fa3e --- /dev/null +++ 
b/packages/gearhash-wasm/vendor/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/LICENSE-MIT b/packages/gearhash-wasm/vendor/LICENSE-MIT new file mode 100644 index 0000000000..487d7160eb --- /dev/null +++ b/packages/gearhash-wasm/vendor/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Sam Rijs and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/README.md b/packages/gearhash-wasm/vendor/README.md new file mode 100644 index 0000000000..34310148b7 --- /dev/null +++ b/packages/gearhash-wasm/vendor/README.md @@ -0,0 +1,60 @@ +# GearHash Test + +This directory contains the GearHash library for content-defined chunking. 
+ +## Running the Test + +To run the test that generates deterministic input and processes it through GearHash: + +```bash +cd packages/gearhash-wasm/vendor +cargo run --bin test_gearhash +``` + +Or, to build the binary first and run it directly: + +```bash +cargo build --bin test_gearhash +./target/debug/test_gearhash +``` + +## Test Details + +The test: +1. Generates a 100,000-byte deterministic input using a simple 24-bit linear congruential generator (LCG) with seed `0xbecd17f` +2. Uses mask `0x0000d90003530000` for chunk boundary detection +3. Processes the input through GearHash and reports chunk boundaries +4. Shows chunk sizes, offsets, and hash values for verification + +## AssemblyScript Adaptation + +The Rust test uses a simple deterministic 24-bit LCG (see `SimpleRng` in `test_gearhash.rs`); port that generator exactly to reproduce the same input bytes. The sketch below shows the general shape of such a port in AssemblyScript, using a 64-bit xorshift as the example generator: + +```typescript +class SimpleRng { + private state: u64; + + constructor(seed: u64) { + this.state = seed; + } + + nextU64(): u64 { + this.state ^= this.state << 13; + this.state ^= this.state >> 7; + this.state ^= this.state << 17; + return this.state; + } + + fillBytes(dest: Uint8Array): void { + for (let i = 0; i < dest.length; i += 8) { + const value = this.nextU64(); + for (let j = 0; j < 8 && i + j < dest.length; j++) { + dest[i + j] = (value >> (j * 8)) as u8; + } + } + } +} +``` + +The test results can be used to verify that the AssemblyScript implementation produces the same chunk boundaries. + diff --git a/packages/gearhash-wasm/vendor/src/lib.rs b/packages/gearhash-wasm/vendor/src/lib.rs new file mode 100644 index 0000000000..58aa95aeee --- /dev/null +++ b/packages/gearhash-wasm/vendor/src/lib.rs @@ -0,0 +1,103 @@ +// From https://github.com/srijs/rust-gearhash/blob/master/src/lib.rs + +//! The GEAR hashing function is a fast, rolling hash function that +//! is well suited for content defined chunking. In particular, it is +//! used as a building block for the [FastCDC](https://www.usenix.org/node/196197) +//! algorithm. +//! +//! The implementation provided in this crate consists of both a simple, +//!
scalar variant, as well as optimized versions for the SSE4.2 and AVX2 +//! instruction sets. +//! +//! ## Example +//! +//! ``` +//! fn find_all_chunks(buf: &[u8], mask: u64) -> Vec<&[u8]> { +//! // set up initial state +//! let mut chunks = vec![]; +//! let mut offset = 0; +//! +//! // create new hasher +//! let mut hasher = gearhash::Hasher::default(); +//! +//! // loop through all matches, and push the corresponding chunks +//! while let Some(boundary) = hasher.next_match(&buf[offset..], mask) { +//! chunks.push(&buf[offset..offset + boundary]); +//! offset += boundary; +//! } +//! +//! // push final chunk +//! chunks.push(&buf[offset..]); +//! chunks +//! } +//! ``` + +#![cfg_attr(feature = "bench", feature(test))] + +#[cfg(feature = "bench")] +extern crate test; +#[cfg(feature = "bench")] +mod bench; + +mod scalar; +mod table; + +pub use table::{Table, DEFAULT_TABLE}; + +/// Gear hash state. Processes bytes to find chunk boundaries. +#[derive(Clone)] +pub struct Hasher<'t> { + table: &'t Table, + hash: u64, +} + +impl<'t> Hasher<'t> { + /// Create a new hasher with the given table. + pub fn new(table: &'t Table) -> Self { + Self { table, hash: 0 } + } + + /// Update the hash state by processing all the bytes in the given slice. + pub fn update(&mut self, buf: &[u8]) { + for b in buf.iter() { + self.hash = (self.hash << 1).wrapping_add(self.table[*b as usize]); + } + } + + /// Match the current hash state against the given mask. + /// + /// Returns true if `hash & mask == 0`, false otherwise. + pub fn is_match(&self, mask: u64) -> bool { + self.hash & mask == 0 + } + + /// Processes the given byte slice until a match is found for the given mask. + /// + /// If a match is found before the end of the byte slice, it returns the number + /// of bytes processed. If no match has been found, it returns `None`. 
+ pub fn next_match(&mut self, buf: &[u8], mask: u64) -> Option<usize> { + crate::scalar::next_match(&mut self.hash, self.table, buf, mask) + } + + /// Retrieve the current hash value. + pub fn get_hash(&self) -> u64 { + self.hash + } + + /// Set the hash value to the given integer. + pub fn set_hash(&mut self, hash: u64) { + self.hash = hash + } +} + +impl Default for Hasher<'static> { + fn default() -> Self { + Hasher::new(&DEFAULT_TABLE) + } +} + +impl<'t> std::fmt::Debug for Hasher<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Hasher").field("hash", &self.hash).finish() + } +} \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/src/scalar.rs b/packages/gearhash-wasm/vendor/src/scalar.rs new file mode 100644 index 0000000000..f06ab449f6 --- /dev/null +++ b/packages/gearhash-wasm/vendor/src/scalar.rs @@ -0,0 +1,17 @@ +use crate::Table; + +/// Scalar GEAR scan: feeds `buf` into `hash` one byte at a time and returns the +/// number of bytes consumed (matching byte included) once `hash & mask == 0`, +/// or `None` if the slice ends first; `hash` is updated in place either way. +#[inline] +pub(crate) fn next_match(hash: &mut u64, table: &Table, buf: &[u8], mask: u64) -> Option<usize> { + for (i, b) in buf.iter().enumerate() { + *hash = (*hash << 1).wrapping_add(table[*b as usize]); + + if *hash & mask == 0 { + return Some(i + 1); + } + } + + None +} diff --git a/packages/gearhash-wasm/vendor/src/table.rs b/packages/gearhash-wasm/vendor/src/table.rs new file mode 100644 index 0000000000..898e603422 --- /dev/null +++ b/packages/gearhash-wasm/vendor/src/table.rs @@ -0,0 +1,262 @@ +/// Gear hash table. +pub type Table = [u64; 256]; + +/// Default hash table, using random (but static) integers.
+pub static DEFAULT_TABLE: Table = [ + 0xb088d3a9e840f559, + 0x5652c7f739ed20d6, + 0x45b28969898972ab, + 0x6b0a89d5b68ec777, + 0x368f573e8b7a31b7, + 0x1dc636dce936d94b, + 0x207a4c4e5554d5b6, + 0xa474b34628239acb, + 0x3b06a83e1ca3b912, + 0x90e78d6c2f02baf7, + 0xe1c92df7150d9a8a, + 0x8e95053a1086d3ad, + 0x5a2ef4f1b83a0722, + 0xa50fac949f807fae, + 0x0e7303eb80d8d681, + 0x99b07edc1570ad0f, + 0x689d2fb555fd3076, + 0x00005082119ea468, + 0xc4b08306a88fcc28, + 0x3eb0678af6374afd, + 0xf19f87ab86ad7436, + 0xf2129fbfbe6bc736, + 0x481149575c98a4ed, + 0x0000010695477bc5, + 0x1fba37801a9ceacc, + 0x3bf06fd663a49b6d, + 0x99687e9782e3874b, + 0x79a10673aa50d8e3, + 0xe4accf9e6211f420, + 0x2520e71f87579071, + 0x2bd5d3fd781a8a9b, + 0x00de4dcddd11c873, + 0xeaa9311c5a87392f, + 0xdb748eb617bc40ff, + 0xaf579a8df620bf6f, + 0x86a6e5da1b09c2b1, + 0xcc2fc30ac322a12e, + 0x355e2afec1f74267, + 0x2d99c8f4c021a47b, + 0xbade4b4a9404cfc3, + 0xf7b518721d707d69, + 0x3286b6587bf32c20, + 0x0000b68886af270c, + 0xa115d6e4db8a9079, + 0x484f7e9c97b2e199, + 0xccca7bb75713e301, + 0xbf2584a62bb0f160, + 0xade7e813625dbcc8, + 0x000070940d87955a, + 0x8ae69108139e626f, + 0xbd776ad72fde38a2, + 0xfb6b001fc2fcc0cf, + 0xc7a474b8e67bc427, + 0xbaf6f11610eb5d58, + 0x09cb1f5b6de770d1, + 0xb0b219e6977d4c47, + 0x00ccbc386ea7ad4a, + 0xcc849d0adf973f01, + 0x73a3ef7d016af770, + 0xc807d2d386bdbdfe, + 0x7f2ac9966c791730, + 0xd037a86bc6c504da, + 0xf3f17c661eaa609d, + 0xaca626b04daae687, + 0x755a99374f4a5b07, + 0x90837ee65b2caede, + 0x6ee8ad93fd560785, + 0x0000d9e11053edd8, + 0x9e063bb2d21cdbd7, + 0x07ab77f12a01d2b2, + 0xec550255e6641b44, + 0x78fb94a8449c14c6, + 0xc7510e1bc6c0f5f5, + 0x0000320b36e4cae3, + 0x827c33262c8b1a2d, + 0x14675f0b48ea4144, + 0x267bd3a6498deceb, + 0xf1916ff982f5035e, + 0x86221b7ff434fb88, + 0x9dbecee7386f49d8, + 0xea58f8cac80f8f4a, + 0x008d198692fc64d8, + 0x6d38704fbabf9a36, + 0xe032cb07d1e7be4c, + 0x228d21f6ad450890, + 0x635cb1bfc02589a5, + 0x4620a1739ca2ce71, + 0xa7e7dfe3aae5fb58, + 0x0c10ca932b3c0deb, + 
0x2727fee884afed7b, + 0xa2df1c6df9e2ab1f, + 0x4dcdd1ac0774f523, + 0x000070ffad33e24e, + 0xa2ace87bc5977816, + 0x9892275ab4286049, + 0xc2861181ddf18959, + 0xbb9972a042483e19, + 0xef70cd3766513078, + 0x00000513abfc9864, + 0xc058b61858c94083, + 0x09e850859725e0de, + 0x9197fb3bf83e7d94, + 0x7e1e626d12b64bce, + 0x520c54507f7b57d1, + 0xbee1797174e22416, + 0x6fd9ac3222e95587, + 0x0023957c9adfbf3e, + 0xa01c7d7e234bbe15, + 0xaba2c758b8a38cbb, + 0x0d1fa0ceec3e2b30, + 0x0bb6a58b7e60b991, + 0x4333dd5b9fa26635, + 0xc2fd3b7d4001c1a3, + 0xfb41802454731127, + 0x65a56185a50d18cb, + 0xf67a02bd8784b54f, + 0x696f11dd67e65063, + 0x00002022fca814ab, + 0x8cd6be912db9d852, + 0x695189b6e9ae8a57, + 0xee9453b50ada0c28, + 0xd8fc5ea91a78845e, + 0xab86bf191a4aa767, + 0x0000c6b5c86415e5, + 0x267310178e08a22e, + 0xed2d101b078bca25, + 0x3b41ed84b226a8fb, + 0x13e622120f28dc06, + 0xa315f5ebfb706d26, + 0x8816c34e3301bace, + 0xe9395b9cbb71fdae, + 0x002ce9202e721648, + 0x4283db1d2bb3c91c, + 0xd77d461ad2b1a6a5, + 0xe2ec17e46eeb866b, + 0xb8e0be4039fbc47c, + 0xdea160c4d5299d04, + 0x7eec86c8d28c3634, + 0x2119ad129f98a399, + 0xa6ccf46b61a283ef, + 0x2c52cedef658c617, + 0x2db4871169acdd83, + 0x0000f0d6f39ecbe9, + 0x3dd5d8c98d2f9489, + 0x8a1872a22b01f584, + 0xf282a4c40e7b3cf2, + 0x8020ec2ccb1ba196, + 0x6693b6e09e59e313, + 0x0000ce19cc7c83eb, + 0x20cb5735f6479c3b, + 0x762ebf3759d75a5b, + 0x207bfe823d693975, + 0xd77dc112339cd9d5, + 0x9ba7834284627d03, + 0x217dc513e95f51e9, + 0xb27b1a29fc5e7816, + 0x00d5cd9831bb662d, + 0x71e39b806d75734c, + 0x7e572af006fb1a23, + 0xa2734f2f6ae91f85, + 0xbf82c6b5022cddf2, + 0x5c3beac60761a0de, + 0xcdc893bb47416998, + 0x6d1085615c187e01, + 0x77f8ae30ac277c5d, + 0x917c6b81122a2c91, + 0x5b75b699add16967, + 0x0000cf6ae79a069b, + 0xf3c40afa60de1104, + 0x2063127aa59167c3, + 0x621de62269d1894d, + 0xd188ac1de62b4726, + 0x107036e2154b673c, + 0x0000b85f28553a1d, + 0xf2ef4e4c18236f3d, + 0xd9d6de6611b9f602, + 0xa1fc7955fb47911c, + 0xeb85fd032f298dbd, + 0xbe27502fb3befae1, + 0xe3034251c4cd661e, 
+ 0x441364d354071836, + 0x0082b36c75f2983e, + 0xb145910316fa66f0, + 0x021c069c9847caf7, + 0x2910dfc75a4b5221, + 0x735b353e1c57a8b5, + 0xce44312ce98ed96c, + 0xbc942e4506bdfa65, + 0xf05086a71257941b, + 0xfec3b215d351cead, + 0x00ae1055e0144202, + 0xf54b40846f42e454, + 0x00007fd9c8bcbcc8, + 0xbfbd9ef317de9bfe, + 0xa804302ff2854e12, + 0x39ce4957a5e5d8d4, + 0xffb9e2a45637ba84, + 0x55b9ad1d9ea0818b, + 0x00008acbf319178a, + 0x48e2bfc8d0fbfb38, + 0x8be39841e848b5e8, + 0x0e2712160696a08b, + 0xd51096e84b44242a, + 0x1101ba176792e13a, + 0xc22e770f4531689d, + 0x1689eff272bbc56c, + 0x00a92a197f5650ec, + 0xbc765990bda1784e, + 0xc61441e392fcb8ae, + 0x07e13a2ced31e4a0, + 0x92cbe984234e9d4d, + 0x8f4ff572bb7d8ac5, + 0x0b9670c00b963bd0, + 0x62955a581a03eb01, + 0x645f83e5ea000254, + 0x41fce516cd88f299, + 0xbbda9748da7a98cf, + 0x0000aab2fe4845fa, + 0x19761b069bf56555, + 0x8b8f5e8343b6ad56, + 0x3e5d1cfd144821d9, + 0xec5c1e2ca2b0cd8f, + 0xfaf7e0fea7fbb57f, + 0x000000d3ba12961b, + 0xda3f90178401b18e, + 0x70ff906de33a5feb, + 0x0527d5a7c06970e7, + 0x22d8e773607c13e9, + 0xc9ab70df643c3bac, + 0xeda4c6dc8abe12e3, + 0xecef1f410033e78a, + 0x0024c2b274ac72cb, + 0x06740d954fa900b4, + 0x1d7a299b323d6304, + 0xb3c37cb298cbead5, + 0xc986e3c76178739b, + 0x9fabea364b46f58a, + 0x6da214c5af85cc56, + 0x17a43ed8b7a38f84, + 0x6eccec511d9adbeb, + 0xf9cab30913335afb, + 0x4a5e60c5f415eed2, + 0x00006967503672b4, + 0x9da51d121454bb87, + 0x84321e13b9bbc816, + 0xfb3d6fb6ab2fdd8d, + 0x60305eed8e160a8d, + 0xcbbf4b14e9946ce8, + 0x00004f63381b10c3, + 0x07d5b7816fcc4e10, + 0xe5a536726a6a8155, + 0x57afb23447a07fdd, + 0x18f346f7abc9d394, + 0x636dc655d61ad33d, + 0xcc8bab4939f7f3f6, + 0x63c7a906c1dd187b, +]; \ No newline at end of file diff --git a/packages/gearhash-wasm/vendor/test_gearhash.rs b/packages/gearhash-wasm/vendor/test_gearhash.rs new file mode 100644 index 0000000000..99606f1752 --- /dev/null +++ b/packages/gearhash-wasm/vendor/test_gearhash.rs @@ -0,0 +1,109 @@ +use gearhash::{Hasher, DEFAULT_TABLE}; + +// 
Simple deterministic RNG for reproducible results (24-bit version) +struct SimpleRng { + state: u32, +} + +impl SimpleRng { + fn new(seed: u32) -> Self { + Self { state: seed & 0xFFFFFF } // Keep only 24 bits + } + + fn next_u24(&mut self) -> u32 { + // Simple 24-bit linear congruential generator + // wrapping_mul/wrapping_add: the u32 intermediate may exceed 32 bits (a plain `+` would panic in debug builds); only the low 24 bits are kept + self.state = self.state.wrapping_mul(1111).wrapping_add(12345) & 0xFFFFFF; + self.state + } + + fn fill_bytes(&mut self, dest: &mut [u8]) { + for chunk in dest.chunks_mut(3) { + let value = self.next_u24(); + for (i, byte) in chunk.iter_mut().enumerate() { + *byte = ((value >> (i * 8)) & 0xFF) as u8; + } + } + } +} + +const BENCH_INPUT_SEED: u32 = 0xbecd17f; +const BENCH_MASK: u64 = 0x0000d90003530000; +const INPUT_SIZE: usize = 100_000; + +fn generate_test_input() -> Vec<u8> { + let mut bytes = vec![0u8; INPUT_SIZE]; + let mut rng = SimpleRng::new(BENCH_INPUT_SEED); + rng.fill_bytes(&mut bytes); + bytes +} + +fn test_gearhash() { + println!("Generating test input with seed: 0x{:x}", BENCH_INPUT_SEED); + let input_buf = generate_test_input(); + println!("Input size: {} bytes", input_buf.len()); + println!("Mask: 0x{:x}", BENCH_MASK); + + let mut hasher = Hasher::new(&DEFAULT_TABLE); + let mut offset = 0; + let mut chunk_count = 0; + let mut total_processed = 0; + + println!("\nProcessing chunks:"); + println!("Chunk | Offset | Size | Hash"); + println!("------|--------|------|------------------"); + + while offset < input_buf.len() { + let chunk_start = offset; + + if let Some(match_size) = hasher.next_match(&input_buf[offset..], BENCH_MASK) { + offset += match_size; + total_processed += match_size; + chunk_count += 1; + + println!("{:5} | {:6} | {:4} | 0x{:016x}", + chunk_count, chunk_start, match_size, hasher.get_hash()); + + hasher.set_hash(0); + } else { + // No more matches, process remaining bytes + let remaining = input_buf.len() - offset; + total_processed += remaining; + chunk_count += 1; + + println!("{:5} | {:6} | {:4} |
0x{:016x} (final)", + chunk_count, offset, remaining, hasher.get_hash()); + break; + } + } + + println!("\nSummary:"); + println!("Total chunks: {}", chunk_count); + println!("Total bytes processed: {}", total_processed); + println!("Average chunk size: {:.1} bytes", total_processed as f64 / chunk_count as f64); + + // Print first few bytes of each chunk for verification (second pass over the same input) + println!("\nFirst 16 bytes of each chunk:"); + offset = 0; + chunk_count = 0; + while offset < input_buf.len() { + hasher.set_hash(0); // start every chunk from a clean state, as the first pass does, so both passes find the same boundaries + if let Some(match_size) = hasher.next_match(&input_buf[offset..], BENCH_MASK) { + let chunk = &input_buf[offset..offset + match_size]; + println!("Chunk {}: {:02x?}", chunk_count + 1, &chunk[..chunk.len().min(16)]); + offset += match_size; + chunk_count += 1; + } else { + let chunk = &input_buf[offset..]; + println!("Chunk {}: {:02x?} (final)", chunk_count + 1, &chunk[..chunk.len().min(16)]); + break; + } + } +} + +fn main() { + test_gearhash(); + + let input_buf = generate_test_input(); + println!("First 100 bytes: {:02x?}", &input_buf[..100]); +} \ No newline at end of file diff --git a/packages/xetchunk-wasm/README.md b/packages/xetchunk-wasm/README.md new file mode 100644 index 0000000000..3ab3d656a1 --- /dev/null +++ b/packages/xetchunk-wasm/README.md @@ -0,0 +1,27 @@ +JS and WASM implementations of https://github.com/huggingface/xet-core/blob/main/deduplication/src/chunking.rs + +Using [AssemblyScript](https://www.assemblyscript.org/) to generate a lean WASM. + +## Usage + +```javascript +import { createChunker, getChunks, nextBlock, finalize } from '@huggingface/xetchunk-wasm'; + +const TARGET_CHUNK_SIZE = Math.pow(2, 12); + +// Create a Uint8Array of data to search through +const data = new Uint8Array(1000000); // Example: 1MB of data +// ... fill data with your content ...
+ +const chunks = getChunks(data, TARGET_CHUNK_SIZE); + +// Alternative, in case your data is streaming +const chunker = createChunker(TARGET_CHUNK_SIZE); + +for await (const data of source) { + const chunks = nextBlock(chunker, data); + console.log(chunks); +} + +console.log("last chunk", finalize(chunker)); +``` diff --git a/packages/xetchunk-wasm/asconfig.json b/packages/xetchunk-wasm/asconfig.json new file mode 100644 index 0000000000..8776597856 --- /dev/null +++ b/packages/xetchunk-wasm/asconfig.json @@ -0,0 +1,22 @@ +{ + "targets": { + "debug": { + "outFile": "build/debug.wasm", + "textFile": "build/debug.wat", + "sourceMap": true, + "debug": true + }, + "release": { + "outFile": "build/release.wasm", + "textFile": "build/release.wat", + "sourceMap": true, + "optimizeLevel": 3, + "shrinkLevel": 0, + "converge": false, + "noAssert": false + } + }, + "options": { + "bindings": "esm" + } +} \ No newline at end of file diff --git a/packages/xetchunk-wasm/assembly/index.ts b/packages/xetchunk-wasm/assembly/index.ts new file mode 100644 index 0000000000..b8b8b62702 --- /dev/null +++ b/packages/xetchunk-wasm/assembly/index.ts @@ -0,0 +1 @@ +export { createChunker, finalize, getChunks, nextBlock } from "./xet-chunker"; // getChunks: one-shot helper defined in ./xet-chunker diff --git a/packages/xetchunk-wasm/assembly/tsconfig.json b/packages/xetchunk-wasm/assembly/tsconfig.json new file mode 100644 index 0000000000..8131d68a0a --- /dev/null +++ b/packages/xetchunk-wasm/assembly/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../node_modules/.pnpm/assemblyscript@0.27.36/node_modules/assemblyscript/std/assembly.json", + "include": ["./**/*.ts"] +} diff --git a/packages/xetchunk-wasm/assembly/xet-chunker.ts b/packages/xetchunk-wasm/assembly/xet-chunker.ts new file mode 100644 index 0000000000..0abfebbfec --- /dev/null +++ b/packages/xetchunk-wasm/assembly/xet-chunker.ts @@ -0,0 +1,148 @@ +import { nextMatch } from "@huggingface/gearhash-wasm/assembly"; +import { blake3 } from "@huggingface/blake3-wasm/assembly"; + +// Constants +const
TARGET_CHUNK_SIZE: i32 = 64 * 1024; // 64KB +const MINIMUM_CHUNK_DIVISOR: i32 = 8; +const MAXIMUM_CHUNK_MULTIPLIER: i32 = 2; +const HASH_WINDOW_SIZE: i32 = 64; + +export class Chunk { + hash: Uint8Array; + length: i32; +} + +// Type for the next() method return value +class NextResult { + chunk: Chunk | null; + bytesConsumed: i32; + + constructor(chunk: Chunk | null, bytesConsumed: i32) { + this.chunk = chunk; + this.bytesConsumed = bytesConsumed; + } +} + +class XetChunker { + private minimumChunk: i32; + private maximumChunk: i32; + private mask: u64; + private chunkBuf: Uint8Array; + private curChunkLen: i32; + private hash: u64; + + constructor(targetChunkSize: i32 = TARGET_CHUNK_SIZE) { + // Validate target chunk size is a power of 2 + assert(targetChunkSize > 0, "Target chunk size must be greater than 0"); + assert((targetChunkSize & (targetChunkSize - 1)) == 0, "Target chunk size must be a power of 2"); + assert(targetChunkSize > HASH_WINDOW_SIZE, "Target chunk size must be greater than hash window size"); + assert(targetChunkSize < i32.MAX_VALUE, "Target chunk size must be less than i32.MAX_VALUE"); + + let mask = (targetChunkSize - 1) as u64; + // Shift the mask all the way left, i.e. by its leading-zero count, so it selects the top bits of the gear hash + mask = mask << clz(mask); + + const maximumChunk = targetChunkSize * MAXIMUM_CHUNK_MULTIPLIER; + + this.minimumChunk = targetChunkSize / MINIMUM_CHUNK_DIVISOR; + this.maximumChunk = maximumChunk; + this.mask = mask; + this.chunkBuf = new Uint8Array(maximumChunk); + this.curChunkLen = 0; + this.hash = 0; + } + + next(data: Uint8Array, isFinal: boolean): NextResult { + const nBytes = data.length; + let createChunk = false; + let consumeLen: i32 = 0; + + if (nBytes != 0) { + // Skip minimum chunk size + if (this.curChunkLen + HASH_WINDOW_SIZE < this.minimumChunk) { + const maxAdvance = min(this.minimumChunk - this.curChunkLen - HASH_WINDOW_SIZE - 1, nBytes - consumeLen); + consumeLen += maxAdvance; + this.curChunkLen += maxAdvance; + } + + // Calculate read end + const
readEnd = min(nBytes, consumeLen + this.maximumChunk - this.curChunkLen); + + let bytesToNextBoundary: i32; + const matchResult = nextMatch(data.subarray(consumeLen, readEnd), this.mask, this.hash); + + if (matchResult.position != -1) { + bytesToNextBoundary = matchResult.position; + createChunk = true; + this.hash = matchResult.hash; + } else { + bytesToNextBoundary = readEnd - consumeLen; + this.hash = matchResult.hash; + } + + // Check if we hit maximum chunk + if (bytesToNextBoundary + this.curChunkLen >= this.maximumChunk) { + bytesToNextBoundary = this.maximumChunk - this.curChunkLen; + createChunk = true; + } + + this.curChunkLen += bytesToNextBoundary; + consumeLen += bytesToNextBoundary; + + // Copy data to chunk buffer + this.chunkBuf.set(data.subarray(0, consumeLen), this.curChunkLen - consumeLen); + } + + if (createChunk || (isFinal && this.curChunkLen > 0)) { + const chunkData = this.chunkBuf.subarray(0, this.curChunkLen); + const chunk: Chunk = { + length: chunkData.length, + hash: blake3(chunkData), + }; + this.curChunkLen = 0; + this.hash = 0; + return new NextResult(chunk, consumeLen); + } + + return new NextResult(null, consumeLen); + } + + nextBlock(data: Uint8Array, isFinal: boolean): Chunk[] { + const chunks: Chunk[] = []; + let pos: i32 = 0; + + while (pos < data.length) { + const result = this.next(data.subarray(pos), isFinal); + if (result.chunk) { + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion + chunks.push(result.chunk!); + } + pos += result.bytesConsumed; + } + + return chunks; + } + + finish(): Chunk | null { + return this.next(new Uint8Array(0), true).chunk; + } +} + +export function createChunker(targetChunkSize: i32 = TARGET_CHUNK_SIZE): XetChunker { + const chunker = new XetChunker(targetChunkSize); + + return chunker; +} + +export function nextBlock(chunker: XetChunker, data: Uint8Array): Chunk[] { + return chunker.nextBlock(data, false); +} + +export function finalize(chunker: XetChunker): Chunk | null { + 
return chunker.finish(); +} + +export function getChunks(data: Uint8Array, targetChunkSize: i32 = TARGET_CHUNK_SIZE): Chunk[] { + const chunker = createChunker(targetChunkSize); + return chunker.nextBlock(data, true); +} diff --git a/packages/xetchunk-wasm/build/.gitignore b/packages/xetchunk-wasm/build/.gitignore new file mode 100644 index 0000000000..d6b7ef32c8 --- /dev/null +++ b/packages/xetchunk-wasm/build/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/packages/xetchunk-wasm/package.json b/packages/xetchunk-wasm/package.json new file mode 100644 index 0000000000..3bc5540ec6 --- /dev/null +++ b/packages/xetchunk-wasm/package.json @@ -0,0 +1,39 @@ +{ + "name": "@huggingface/xetchunk-wasm", + "version": "0.0.1", + "scripts": { + "build:debug": "asc assembly/index.ts --target debug", + "build:release": "asc assembly/index.ts --target release", + "build": "pnpm run build:debug && npm run build:release", + "test": "node tests", + "prepare": "pnpm run build" + }, + "keywords": [ + "xet", + "chunk", + "chunking", + "assemblyscript", + "assembly", + "wasm" + ], + "dependencies": { + "@huggingface/blake3-wasm": "workspace:*", + "@huggingface/gearhash-wasm": "workspace:*" + }, + "type": "module", + "exports": { + ".": { + "import": "./build/release.js", + "types": "./build/release.d.ts" + }, + "./assembly": { + "import": "./assembly/index.ts" + }, + "./wasm": { + "import": "./build/release.wasm" + } + }, + "devDependencies": { + "assemblyscript": "0.27.36" + } +} diff --git a/packages/xetchunk-wasm/pnpm-lock.yaml b/packages/xetchunk-wasm/pnpm-lock.yaml new file mode 100644 index 0000000000..4e5f34eb68 --- /dev/null +++ b/packages/xetchunk-wasm/pnpm-lock.yaml @@ -0,0 +1,45 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + '@huggingface/blake3-wasm': + specifier: workspace:* + version: link:../blake3-wasm + '@huggingface/gearhash-wasm': + specifier: workspace:* + version: 
link:../gearhash-wasm + devDependencies: + assemblyscript: + specifier: 0.27.36 + version: 0.27.36 + +packages: + + assemblyscript@0.27.36: + resolution: {integrity: sha512-1qX2zf6p7l/mNYv8r21jC/Yft7kX7XKR3xUHw41zvV4xad5lyC8w7jZiwZBGoy64VKZLc+bTDJDWi8Kb70YrHA==} + engines: {node: '>=18', npm: '>=10'} + hasBin: true + + binaryen@116.0.0-nightly.20240114: + resolution: {integrity: sha512-0GZrojJnuhoe+hiwji7QFaL3tBlJoA+KFUN7ouYSDGZLSo9CKM8swQX8n/UcbR0d1VuZKU+nhogNzv423JEu5A==} + hasBin: true + + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + +snapshots: + + assemblyscript@0.27.36: + dependencies: + binaryen: 116.0.0-nightly.20240114 + long: 5.3.2 + + binaryen@116.0.0-nightly.20240114: {} + + long@5.3.2: {} diff --git a/packages/xetchunk-wasm/tests/index.js b/packages/xetchunk-wasm/tests/index.js new file mode 100644 index 0000000000..f1a485f785 --- /dev/null +++ b/packages/xetchunk-wasm/tests/index.js @@ -0,0 +1,17 @@ +import { createChunker, finalize, nextBlock } from "../build/debug.js"; + +const chunker = createChunker(Math.pow(2, 12)); + +const data = new Uint8Array(100_000); + +for (let i = 0; i < data.length; i++) { + data[i] = i; +} + +const chunks = nextBlock(chunker, data); + +console.log("chunks", chunks); + +const lastChunk = finalize(chunker); + +console.log("lastChunk", lastChunk); diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 08e651bb73..bc118ffdff 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -14,3 +14,6 @@ packages: - "packages/ollama-utils" - "packages/mcp-client" - "packages/tiny-agents" + - "packages/gearhash-wasm" + - "packages/blake3-wasm" + - "packages/xetchunk-wasm" diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000000..fbe8ff6fda --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "node", + "strict": true, + 
"esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "lib": ["ESNext"], + "types": ["assemblyscript"] + } +}