Skip to content

Commit

Permalink
feat: skip compressing unicode in smaz
Browse files Browse the repository at this point in the history
  • Loading branch information
seia-soto committed Mar 5, 2025
1 parent ef0a1b8 commit 991e646
Showing 1 changed file with 96 additions and 43 deletions.
139 changes: 96 additions & 43 deletions packages/adblocker/src/data-view.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

import { Smaz } from '@remusao/smaz';
import Compression from './compression.js';
import crc32 from './crc32.js';
import { decode, encode } from './punycode.js';

interface IDataViewOptions {
enableCompression: boolean;
Expand All @@ -23,6 +23,9 @@ const LITTLE_ENDIAN: boolean = new Int8Array(new Int16Array([1]).buffer)[0] ===
// TextEncoder doesn't need to be recreated every time unlike TextDecoder
const TEXT_ENCODER = new TextEncoder();

// One-byte signatures written by `pushSmaz` in front of every chunk. The value
// `0` is used as the end-of-string terminator, so both signatures are non-zero.
// Signature of a chunk made of ASCII characters only (char codes <= 127); its
// payload is the smaz-compressed bytes of the chunk.
const SMAZ_CHUNKSIG_ASCII = 0xd0;
// Signature of a chunk made of non-ASCII characters only (char codes > 127);
// its payload is written with `pushUTF8`, bypassing smaz compression.
const SMAZ_CHUNKSIG_UTF8 = 0xc0;

// Store compression in a lazy, global singleton
let getCompressionSingleton: () => Compression = () => {
const COMPRESSION = new Compression();
Expand Down Expand Up @@ -101,66 +104,77 @@ export function sizeOfUint32Array(array: Uint32Array): number {
return array.byteLength + sizeOfLength(array.length);
}

/**
 * Estimate how many bytes `str` occupies in the chunked smaz layout.
 *
 * The string is cut into maximal runs of ASCII (char code <= 127) and
 * non-ASCII (char code > 127) UTF-16 code units. Each run costs one signature
 * byte plus its payload: the UTF-8 size for non-ASCII runs, the
 * length-prefixed compressed size for ASCII runs. One extra byte accounts for
 * the terminating NULL.
 *
 * @param str - string to measure
 * @param smaz - codebook used to estimate the compressed size of ASCII runs
 * @returns estimated size in bytes
 */
function sizeOfSmaz(str: string, smaz: Smaz): number {
  const length = str.length;
  // Start with the trailing NULL byte that closes the chunk sequence.
  let total = sizeOfByte();
  let start = 0;

  while (start < length) {
    const isUnicode = str.charCodeAt(start) > 127;

    // Extend the run while the following code units are of the same class.
    let end = start + 1;
    while (end < length && (str.charCodeAt(end) > 127) === isUnicode) {
      end += 1;
    }

    const chunk = str.slice(start, end);
    // Chunk signature byte.
    total += sizeOfByte();
    // Non-ASCII runs skip compression; ASCII runs are compressed.
    total += isUnicode
      ? sizeOfUTF8(chunk)
      : sizeOfBytesWithLength(smaz.getCompressedSize(chunk), false);

    start = end;
  }

  return total;
}

/**
 * Size in bytes needed to serialize a network redirect string.
 *
 * @param str - redirect value to measure
 * @param compression - when `true`, measure the chunked smaz encoding;
 *   otherwise measure the plain ASCII encoding
 * @returns estimated size in bytes
 */
export function sizeOfNetworkRedirect(str: string, compression: boolean): number {
  return compression === true
    ? sizeOfSmaz(str, getCompressionSingleton().networkRedirect)
    : sizeOfASCII(str);
}

/**
 * Size in bytes needed to serialize a network hostname string.
 *
 * @param str - hostname to measure
 * @param compression - when `true`, measure the chunked smaz encoding;
 *   otherwise measure the plain ASCII encoding
 * @returns estimated size in bytes
 */
export function sizeOfNetworkHostname(str: string, compression: boolean): number {
  return compression === true
    ? sizeOfSmaz(str, getCompressionSingleton().networkHostname)
    : sizeOfASCII(str);
}

/**
 * Size in bytes needed to serialize a network CSP string.
 *
 * @param str - CSP value to measure
 * @param compression - when `true`, measure the chunked smaz encoding;
 *   otherwise measure the plain ASCII encoding
 * @returns estimated size in bytes
 */
export function sizeOfNetworkCSP(str: string, compression: boolean): number {
  return compression === true
    ? sizeOfSmaz(str, getCompressionSingleton().networkCSP)
    : sizeOfASCII(str);
}

/**
 * Size in bytes needed to serialize a network filter string.
 *
 * @param str - filter string to measure
 * @param compression - when `true`, measure the chunked smaz encoding;
 *   otherwise measure the plain ASCII encoding
 * @returns estimated size in bytes
 */
export function sizeOfNetworkFilter(str: string, compression: boolean): number {
  return compression === true
    ? sizeOfSmaz(str, getCompressionSingleton().networkFilter)
    : sizeOfASCII(str);
}

/**
 * Size in bytes needed to serialize a cosmetic selector string.
 *
 * @param str - selector to measure
 * @param compression - when `true`, measure the chunked smaz encoding;
 *   otherwise measure the plain ASCII encoding
 * @returns estimated size in bytes
 */
export function sizeOfCosmeticSelector(str: string, compression: boolean): number {
  return compression === true
    ? sizeOfSmaz(str, getCompressionSingleton().cosmeticSelector)
    : sizeOfASCII(str);
}

/**
 * Size in bytes needed to serialize the raw text of a network filter.
 *
 * The string is measured as-is: non-ASCII runs are accounted for by
 * `sizeOfSmaz`, so no transcoding step is applied beforehand.
 *
 * @param str - raw filter text to measure
 * @param compression - when `true`, measure the chunked smaz encoding;
 *   otherwise measure the UTF-8 encoding
 * @returns estimated size in bytes
 */
export function sizeOfRawNetwork(str: string, compression: boolean): number {
  return compression === true
    ? sizeOfSmaz(str, getCompressionSingleton().networkRaw)
    : sizeOfUTF8(str);
}

/**
 * Size in bytes needed to serialize the raw text of a cosmetic filter.
 *
 * The string is measured as-is: non-ASCII runs are accounted for by
 * `sizeOfSmaz`, so no transcoding step is applied beforehand.
 *
 * @param str - raw filter text to measure
 * @param compression - when `true`, measure the chunked smaz encoding;
 *   otherwise measure the UTF-8 encoding
 * @returns estimated size in bytes
 */
export function sizeOfRawCosmetic(str: string, compression: boolean): number {
  return compression === true
    ? sizeOfSmaz(str, getCompressionSingleton().cosmeticRaw)
    : sizeOfUTF8(str);
}

Expand Down Expand Up @@ -434,107 +448,146 @@ export class StaticDataView {
return String.fromCharCode.apply(null, this.buffer.subarray(this.pos - byteLength, this.pos));
}

/**
 * Write `str` as a sequence of signed chunks followed by a NULL byte.
 *
 * The string is cut into maximal runs of ASCII (char code <= 127) and
 * non-ASCII (char code > 127) UTF-16 code units. ASCII runs are written as
 * `SMAZ_CHUNKSIG_ASCII` followed by the smaz-compressed bytes; non-ASCII runs
 * are written as `SMAZ_CHUNKSIG_UTF8` followed by the run pushed with
 * `pushUTF8`, without compression.
 *
 * @param str - string to serialize
 * @param smaz - codebook used to compress the ASCII runs
 */
private pushSmaz(str: string, smaz: Smaz): void {
  const length = str.length;
  let start = 0;

  while (start < length) {
    const isUnicode = str.charCodeAt(start) > 127;

    // Extend the run while the following code units are of the same class.
    let end = start + 1;
    while (end < length && (str.charCodeAt(end) > 127) === isUnicode) {
      end += 1;
    }

    const chunk = str.slice(start, end);
    if (isUnicode) {
      this.pushUint8(SMAZ_CHUNKSIG_UTF8);
      this.pushUTF8(chunk);
    } else {
      this.pushUint8(SMAZ_CHUNKSIG_ASCII);
      this.pushBytes(smaz.compress(chunk));
    }

    start = end;
  }

  // Terminator read back by `getSmaz` to know where the chunks stop.
  this.pushUint8(0);
}

/**
 * Read back a string written by `pushSmaz`.
 *
 * Chunks are consumed until the NULL terminator is met. A chunk tagged
 * `SMAZ_CHUNKSIG_UTF8` is read with `getUTF8`; a chunk tagged
 * `SMAZ_CHUNKSIG_ASCII` is read with `getBytes` and decompressed with `smaz`.
 *
 * @param smaz - codebook used to decompress the ASCII chunks
 * @returns the concatenation of all decoded chunks
 * @throws Error when a chunk signature is not one of the known values
 */
private getSmaz(smaz: Smaz): string {
  let data = '';
  for (let type = this.getUint8(); type !== 0; type = this.getUint8()) {
    if (type === SMAZ_CHUNKSIG_UTF8) {
      data += this.getUTF8();
    } else if (type === SMAZ_CHUNKSIG_ASCII) {
      data += smaz.decompress(this.getBytes());
    } else {
      // Skipping an unknown signature would leave its payload unread and the
      // following bytes would be misinterpreted as signatures; fail loudly
      // instead of returning garbage from a corrupted buffer.
      throw new Error(`Unexpected smaz chunk signature: ${type}`);
    }
  }
  return data;
}

/**
 * Serialize a network redirect string.
 *
 * Uses the chunked smaz encoding when this view has compression configured,
 * plain ASCII otherwise.
 *
 * @param str - redirect value to write
 */
public pushNetworkRedirect(str: string): void {
  if (this.compression !== undefined) {
    this.pushSmaz(str, this.compression.networkRedirect);
  } else {
    this.pushASCII(str);
  }
}

/**
 * Read a network redirect string written by `pushNetworkRedirect`.
 *
 * @returns the decoded string (chunked smaz when compression is configured,
 *   plain ASCII otherwise)
 */
public getNetworkRedirect(): string {
  if (this.compression !== undefined) {
    return this.getSmaz(this.compression.networkRedirect);
  }
  return this.getASCII();
}

/**
 * Serialize a network hostname string.
 *
 * Uses the chunked smaz encoding when this view has compression configured,
 * plain ASCII otherwise.
 *
 * @param str - hostname to write
 */
public pushNetworkHostname(str: string): void {
  if (this.compression !== undefined) {
    this.pushSmaz(str, this.compression.networkHostname);
  } else {
    this.pushASCII(str);
  }
}

/**
 * Read a network hostname string written by `pushNetworkHostname`.
 *
 * @returns the decoded string (chunked smaz when compression is configured,
 *   plain ASCII otherwise)
 */
public getNetworkHostname(): string {
  if (this.compression !== undefined) {
    return this.getSmaz(this.compression.networkHostname);
  }
  return this.getASCII();
}

/**
 * Serialize a network CSP string.
 *
 * Uses the chunked smaz encoding when this view has compression configured,
 * plain ASCII otherwise.
 *
 * @param str - CSP value to write
 */
public pushNetworkCSP(str: string): void {
  if (this.compression !== undefined) {
    this.pushSmaz(str, this.compression.networkCSP);
  } else {
    this.pushASCII(str);
  }
}

/**
 * Read a network CSP string written by `pushNetworkCSP`.
 *
 * @returns the decoded string (chunked smaz when compression is configured,
 *   plain ASCII otherwise)
 */
public getNetworkCSP(): string {
  if (this.compression !== undefined) {
    return this.getSmaz(this.compression.networkCSP);
  }
  return this.getASCII();
}

/**
 * Serialize a network filter string.
 *
 * Uses the chunked smaz encoding when this view has compression configured,
 * plain ASCII otherwise.
 *
 * @param str - filter string to write
 */
public pushNetworkFilter(str: string): void {
  if (this.compression !== undefined) {
    this.pushSmaz(str, this.compression.networkFilter);
  } else {
    this.pushASCII(str);
  }
}

/**
 * Read a network filter string written by `pushNetworkFilter`.
 *
 * @returns the decoded string (chunked smaz when compression is configured,
 *   plain ASCII otherwise)
 */
public getNetworkFilter(): string {
  if (this.compression !== undefined) {
    return this.getSmaz(this.compression.networkFilter);
  }
  return this.getASCII();
}

/**
 * Serialize a cosmetic selector string.
 *
 * Uses the chunked smaz encoding when this view has compression configured,
 * plain ASCII otherwise.
 *
 * @param str - selector to write
 */
public pushCosmeticSelector(str: string): void {
  if (this.compression !== undefined) {
    this.pushSmaz(str, this.compression.cosmeticSelector);
  } else {
    this.pushASCII(str);
  }
}

/**
 * Read a cosmetic selector string written by `pushCosmeticSelector`.
 *
 * @returns the decoded string (chunked smaz when compression is configured,
 *   plain ASCII otherwise)
 */
public getCosmeticSelector(): string {
  if (this.compression !== undefined) {
    return this.getSmaz(this.compression.cosmeticSelector);
  }
  return this.getASCII();
}

/**
 * Serialize the raw text of a cosmetic filter.
 *
 * Uses the chunked smaz encoding when this view has compression configured
 * (non-ASCII runs are handled by `pushSmaz`, so the string is passed as-is),
 * UTF-8 otherwise.
 *
 * @param str - raw filter text to write
 */
public pushRawCosmetic(str: string): void {
  if (this.compression !== undefined) {
    this.pushSmaz(str, this.compression.cosmeticRaw);
  } else {
    this.pushUTF8(str);
  }
}

/**
 * Read the raw text of a cosmetic filter written by `pushRawCosmetic`.
 *
 * @returns the decoded string (chunked smaz when compression is configured,
 *   UTF-8 otherwise)
 */
public getRawCosmetic(): string {
  if (this.compression !== undefined) {
    return this.getSmaz(this.compression.cosmeticRaw);
  }
  return this.getUTF8();
}

/**
 * Serialize the raw text of a network filter.
 *
 * Uses the chunked smaz encoding when this view has compression configured
 * (non-ASCII runs are handled by `pushSmaz`, so the string is passed as-is),
 * UTF-8 otherwise.
 *
 * @param str - raw filter text to write
 */
public pushRawNetwork(str: string): void {
  if (this.compression !== undefined) {
    this.pushSmaz(str, this.compression.networkRaw);
  } else {
    this.pushUTF8(str);
  }
}

/**
 * Read the raw text of a network filter written by `pushRawNetwork`.
 *
 * @returns the decoded string (chunked smaz when compression is configured,
 *   UTF-8 otherwise)
 */
public getRawNetwork(): string {
  if (this.compression !== undefined) {
    return this.getSmaz(this.compression.networkRaw);
  }
  return this.getUTF8();
}
Expand Down

0 comments on commit 991e646

Please sign in to comment.