Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 0872692

Browse files
committedJan 27, 2023
Mostly minor typographical and grammar revisions following proof re-read
1 parent 0c88909 commit 0872692

File tree

1 file changed

+285
-219
lines changed

1 file changed

+285
-219
lines changed
 

‎CRAMcodecs.tex

+285-219
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,11 @@ \subsection{Pseudocode introduction}
134134
have many models. Here we use an object oriented way of describing
135135
the problem with $instance$.\textsc{Function} notation.
136136

137+
Note some functions may return multiple items, such as
138+
\texttt{return (}\textit{value, length}\texttt{)}, but the calling
139+
code may assign a single variable to this result. In this case the first
140+
value \textit{value} will be used and \textit{length} will be discarded.
141+
137142
\subsection{Mathematical operators}
138143

139144
\begin{tabular}{rl}
@@ -154,9 +159,9 @@ \subsection{Mathematical operators}
154159
$a \bitor b$ & Bit-wise OR operator, joining values $a$, $b$\\
155160
$a \logor b$ & Logical OR operator, joining expressions $a$, $b$\\
156161
$a \logand b$ & Logical AND operator, joining expressions $a$, $b$\\
157-
$a \concat b$ & String concatenation of $a$ and $b$: $ab$.\\
158-
$V_i$ & Element $i$ of vector $V$.\\
159-
& The entire vector $V$ may be passed into a function.\\
162+
$a \concat b$ & String concatenation of $a$ and $b$: $ab$\\
163+
$V_i$ & Element $i$ of vector $V$\\
164+
& The entire vector $V$ may be passed into a function\\
160165
$W_{i,j}$ & Element $i,j$ of two-dimensional vector $W$.\\
161166
& The entire vector $W$ or a one dimensional slice $W_i$ (of size $j$) may be passed into a function.\\
162167
\hline
@@ -199,10 +204,12 @@ \subsection{Implicit functions}
199204

200205
\subsection{Other basic functions}
201206

207+
7-bit integer encoding stores values 7 bits at a time with the top bit
208+
set if further bytes are required.
209+
202210
\begin{algorithmic}[1]
203211
\Statex
204-
\Statex \textit{Read a variable sized unsigned integer 7-bits at a time.}
205-
\Statex \textit{Returns the value and number of bytes read, but caller is permitted to only use value in assignments.}
212+
\Statex (Read a variable sized unsigned integer 7-bits at a time. Returns the value.)
206213
\Function{ReadUint7}{$source$} \Comment{If $source$ is unspecified then it is the default input stream}
207214
\State $value \gets 0$
208215
\State $length \gets 0$
@@ -211,18 +218,51 @@ \subsection{Other basic functions}
211218
\State $value \gets (value \shiftl 7) + (c \bitand 127)$
212219
\State $length \gets length + 1$
213220
\Until{$c < 128$}
214-
\State \Return ($value$, $length$) \Comment{or just $value$ if caller uses only that}
221+
\State \Return $value$
222+
\EndFunction
223+
\end{algorithmic}
224+
225+
ITF8 integer encoding stores the additional number of bytes needed in
226+
the count of the top bits set in the initial byte (ending with a zero
227+
bit), followed by any subsequent whole bytes. See the main CRAM
228+
specification for more details.
229+
230+
\begin{algorithmic}[1]
231+
\Statex
232+
\Statex (Read a variable sized unsigned integer with ITF8 encoding. Returns the value.)
233+
\Function{ReadITF8}{$source$} \Comment{If $source$ is unspecified then it is the default input stream}
234+
\State $v \gets$ \Call{ReadUint8}{}
235+
\If{$v \geq \mathtt{0xf0}$}\Comment{1111xxxx => +4 bytes}
236+
\State $v \gets (v\ \bitand \mathtt{0x0f}) \shiftl 28$
237+
\State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 20)$
238+
\State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 12)$
239+
\State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 4)$
240+
\State $v \gets v + ($ \Call{ReadUint8}{} $\bitand \mathtt{0x0f})$
241+
\ElsIf{$v \geq \mathtt{0xe0}$}\Comment{1110xxxx => +3 bytes}
242+
\State $v \gets (v\ \bitand \mathtt{0x0f}) \shiftl 24$
243+
\State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 16)$
244+
\State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 8)$
245+
\State $v \gets v + $ \Call{ReadUint8}{}
246+
\ElsIf{$v \geq \mathtt{0xc0}$}\Comment{110xxxxx => +2 bytes}
247+
\State $v \gets (v\ \bitand \mathtt{0x1f}) \shiftl 16$
248+
\State $v \gets v + ($ \Call{ReadUint8}{} $\shiftl 8)$
249+
\State $v \gets v + $ \Call{ReadUint8}{}
250+
\ElsIf{$v \geq \mathtt{0x80}$}\Comment{10xxxxxx => +1 byte}
251+
\State $v \gets (v\ \bitand \mathtt{0x3f}) \shiftl 8$
252+
\State $v \gets v + $ \Call{ReadUint8}{}
253+
\EndIf
254+
\State \Return $v$
215255
\EndFunction
216256
\end{algorithmic}
217257

218-
\section{rANS 4x8 - Asymmetric Numeral System}
258+
\section{rANS 4x8 - Asymmetric Numeral Systems}
219259

220260
% Lifted over from CRAMv3.tex
221261

222262
This is the rANS format first defined in CRAM v3.0.
223263

224264
rANS is the range-coder variant of the Asymmetric Numeral
225-
System\footnote{J. Duda, \textit{Asymmetric numeral systems: entropy
265+
Systems\footnote{J. Duda, \textit{Asymmetric numeral systems: entropy
226266
coding combining speed of Huffman coding with compression rate of
227267
arithmetic coding}, \url{http://arxiv.org/abs/1311.2540}}.
228268

@@ -257,9 +297,9 @@ \subsubsection*{\textbf{rANS 4x8 compressed data structure}}
257297
\hline
258298
byte & order & the order of the codec, either 0 or 1\tabularnewline
259299
\hline
260-
int & compressed size & the size in bytes of frequency table and compressed blob\tabularnewline
300+
uint32 & compressed size & the size in bytes of frequency table and compressed blob\tabularnewline
261301
\hline
262-
int & data size & raw or uncompressed data size in bytes\tabularnewline
302+
uint32 & data size & raw or uncompressed data size in bytes\tabularnewline
263303
\hline
264304
byte[] & frequency table & byte frequencies of input data written using RLE\tabularnewline
265305
\hline
@@ -269,12 +309,20 @@ \subsubsection*{\textbf{rANS 4x8 compressed data structure}}
269309

270310
\subsection{\textbf{Frequency table}}
271311

272-
The alphabet used here is simply byte values, so a maximum of 256
273-
symbols as some values may not be present.
312+
The alphabet used here has a maximum of 256 possible symbols (all byte
313+
values), but alphabets with fewer symbols are permitted too.
274314

275315
The symbol frequency table indicates which symbols are present and
276316
what their relative frequencies are. The total sum of symbol
277-
frequencies are normalised to add up to 4095.
317+
frequencies are normalised to add up to 4095\footnote{While the maths
318+
work fine up to 4096, for historical reasons this has always been
319+
documented as having a limit of 4095. Implementations may wish to
320+
validate decoding on $\le 4096$, but we recommend they use a limit of
321+
4095 in their encoding output.}. Given rounding differences when
322+
renormalising to a fixed sum, it is up to the encoder to decide how to
323+
distribute any remainder or remove excess frequencies. The normalised
324+
frequency tables below are examples and not prescriptive of a specific
325+
normalisation strategy.
278326

279327
Formally, this is an ordered alphabet $\mathbb{A}$ containing symbols $s$ where
280328
$s_{i}$ is the $i$-th symbol in $\mathbb{A}$, occurring with the frequency $freq_{i}$.
@@ -431,9 +479,9 @@ \subsubsection*{Order-1 encoding}
431479
used for each rANS state, which will not be used as a context. In
432480
extreme cases this may even be the only time that symbol occurs
433481
anywhere. While these scenarios represent unnecessary data to store,
434-
their presence does not invalidate the data format and it may be
435-
simpler to use a more naive algorithm when producing the frequency
436-
tables.
482+
and these frequency entries can be safely omitted, their presence does
483+
not invalidate the data format and it may be simpler to use a more
484+
naive algorithm when producing the frequency tables.
437485

438486

439487
The above tables are encoded as:
@@ -468,10 +516,9 @@ \subsubsection*{Order-1 encoding}
468516
0x00 # end of contexts
469517
\end{verbatim}
470518

471-
\newpage
472519
\subsection{rANS entropy encoding}
473520

474-
The encoder takes a symbol $s$ and a current state $x$ (initially zero) to
521+
The encoder takes a symbol $s$ and a current state $x$ (initially $L$ below) to
475522
produce a new state $x'$ with function $C$.
476523

477524
{
@@ -543,7 +590,7 @@ \subsection{rANS entropy encoding}
543590
}
544591

545592

546-
The $x' = C(s,x)$ function used for the ${i}-th symbol ${s} is:
593+
The $x' = C(s,x)$ function used for the $i$-th symbol $s$ is:
547594

548595
{
549596
\setlength{\parindent}{1cm}
@@ -568,8 +615,8 @@ \subsection{rANS entropy encoding}
568615
at a time (encoded and decoded in reverse order).
569616

570617
Before every encode $C(s,x)$ we renormalise $x$, shifting out the bottom 8
571-
bits of $x$ until $x < \mathtt{0x80000} \times freq_i$. After finishing encoding we
572-
flush 4 more bytes (lowest 8-bits first) from $x$.
618+
bits of $x$ until $x < \mathtt{0x80000} \times freq_i$. After
619+
finishing all encoding we flush 4 more bytes (lowest 8-bits first) from $x$.
573620

574621
After every decoded $D(x')$ we renormalise $x'$, shifting in the bottom 8
575622
bits until $x \geq \mathtt{0x800000}$.
@@ -585,16 +632,16 @@ \subsubsection*{Interleaving}
585632
(so the output bytes get interleaved).
586633

587634
For the Order-1 codec we cannot do this as we need to know the
588-
previous byte value as the context for the next byte. Therefore split
635+
previous byte value as the context for the next byte. We therefore split
589636
the input data into 4 approximately equal sized
590637
fragments\footnote{This was why the `\textbackslash0' $\to$ `a'
591638
context in the example above had a frequency of 4 instead of 1.}
592639
starting at $0$, $\lfloor{}len/4\rfloor{}$,
593640
$\lfloor{}len/4\rfloor{}\times2$ and $\lfloor{}len/4\rfloor{}\times 3$. Each
594641
Order-1 codec operates in a cyclic fashion as with Order-0, all
595-
starting with 0 as their state and sharing the same output buffer. Any
596-
remainder, when the input buffer is not divisible by 4, is processed at
597-
the end by the 4th rANS state.
642+
starting with 0 as their state and sharing the same compressed output
643+
buffer. Any remainder, when the input buffer is not divisible by 4, is
644+
processed at the end by the 4th rANS state.
598645

599646
We do not permit Order-1 encoding of data streams smaller than 4
600647
bytes.
@@ -603,7 +650,7 @@ \subsection{rANS decode pseudocode}
603650

604651
A na\"ive implementation of a rANS decoder follows.
605652
This pseudocode is for clarity only and is not expected to be performant; a real implementation would normally be rewritten to use lookup tables for maximum efficiency.
606-
The function \textsc{ReadUint8} below is undefined, but is expected to fetch the next single unsigned byte from an unspecified input source. Similarly for \textsc{ReadITF8} (variable size inetger) and \textsc{ReadUint32} (32-bit unsigned integer in little endian format).
653+
The function \textsc{ReadUint8} fetches the next single unsigned byte from an unspecified input source. Similarly for \textsc{ReadITF8} (variable size integer) and \textsc{ReadUint32} (32-bit unsigned integer in little endian format).
607654

608655
\vskip 0.5cm
609656

@@ -630,16 +677,15 @@ \subsubsection*{rANS order-0}
630677
\vskip 0.5cm
631678

632679
\begin{algorithmic}[1]
633-
\Statex (Reads a table of Order-0 symbol frequencies $F_i$
680+
\Statex (Reads a table of Order-0 symbol frequencies $F_i$)
634681
\Statex (and sets the cumulative frequency table $C_{i+1} = C_i+F_i$)
635-
\Procedure{ReadFrequencies0}{$F, C$}
682+
\Procedure{ReadFrequencies0}{$F,\ C$}
636683
\State $s \gets$ \Call{ReadUint8}{}\Comment{Next alphabet symbol}
637684
\State $last\_sym \gets s$
638685
\State $rle \gets 0$
639686
\Repeat
640-
\State $f \gets$ \Call{ReadITF8}{}
641687
\settowidth{\maxwidth}{$C_s$}
642-
\State \algalign{F_s}{\gets} $f$
688+
\State \algalign{F_s}{\gets} \Call{ReadITF8}{}
643689
\If{$rle > 0$}
644690
\settowidth{\maxwidth}{rle\ }
645691
\State \algalign{rle}{\gets} $rle-1$
@@ -690,7 +736,7 @@ \subsubsection*{rANS order-0}
690736
\Statex
691737
\Procedure{RansDecode0}{$output$, $nbytes$}
692738
\State \Call{ReadFrequencies0}{$F, C$}
693-
\For{$j\gets 0 \algorithmicto 3$}\Comment{4 interleaved streams}
739+
\For{$j\gets 0 \algorithmicto 3$}\Comment{Initialise the 4 interleaved streams}
694740
\State $R_j \gets$ \Call{ReadUint32}{}\Comment{Unsigned 32-bit little endian}
695741
\EndFor
696742
\For{$i\gets 0 \algorithmicto nbytes-1$}
@@ -707,7 +753,7 @@ \subsubsection*{rANS order-0}
707753
\subsubsection*{rANS order-1}
708754

709755
As described above, the decode logic is very similar to rANS Order-0 except we have a two dimensional array of frequencies to read and the decode uses the last character as the context for decoding the next one.
710-
In the pseudocode we demonstrate this by using two dimensional vectors $C_{i,j}$ and $F_{i,j}$.
756+
In the pseudocode we illustrate this by using two dimensional vectors $C_{i,j}$ and $F_{i,j}$.
711757
For simplicity, we reuse the Order-0 code by referring to $C_i$ and $F_i$ of the 2D vectors to get a single dimensional vector that operates in the same manner as the Order-0 code.
712758
This is not necessarily the most efficient implementation.
713759

@@ -717,9 +763,9 @@ \subsubsection*{rANS order-1}
717763
\vskip 0.5cm
718764

719765
\begin{algorithmic}[1]
720-
\Statex (Reads a table of Order-1 symbol frequencies $F_{i,j}$
766+
\Statex (Reads a table of Order-1 symbol frequencies $F_{i,j}$)
721767
\Statex (and sets the cumulative frequency table $C_{i,j+1} = C_{i,j}+F_{i,j}$)
722-
\Procedure{ReadFrequencies1}{$F, C$}
768+
\Procedure{ReadFrequencies1}{$F,\ C$}
723769
\State $sym \gets$ \Call{ReadUint8}{}\Comment{Next alphabet symbol}
724770
\State $last\_sym \gets sym$
725771
\State $rle \gets 0$
@@ -742,7 +788,7 @@ \subsubsection*{rANS order-1}
742788
\Statex
743789
\Procedure{RansDecode1}{$output$, $nbytes$}
744790
\State \Call{ReadFrequencies1}{$F, C$}
745-
\For{$j\gets 0 \algorithmicto 3$}\Comment{4 interleaved streams}
791+
\For{$j\gets 0 \algorithmicto 3$}\Comment{Initialise 4 interleaved streams}
746792
\State $R_j \gets$ \Call{ReadUint32}{}\Comment{Unsigned 32-bit little endian}
747793
\State $L_j \gets 0$\Comment{Last symbol}
748794
\EndFor
@@ -791,15 +837,59 @@ \section{rANS Nx16}
791837
Frequencies are now stored using uint7 format instead of ITF8. The
792838
tables are also stored differently, separating the list of symbols
793839
present in the alphabet (those with frequency greater than zero) from
794-
the frequencies themselves. The symbol list must be stored in
795-
ascending ASCII order, with their frequency values in the same
796-
ordering as their corresponding symbols. For the Order-1 frequency
797-
table this list of symbols is those used in any context, thus we only
798-
have one alphabet recorded for all contexts. This means in some
799-
contexts some (potentially many) symbols will have zero frequency. To
800-
reduce the Order-1 table size an additional zero run-length encoding
801-
step is used. Finally the Order-1 frequencies may optionally be
802-
compressed using the Order-0 rANS Nx16 codec.
840+
the frequencies themselves.
841+
842+
Finally transformations may be applied to the data prior to
843+
compression (or after decompression). These consist of stripe, for
844+
structured data where every Nth byte is sent to one of N separate
845+
compression streams, Run Length Encoding replacing repeated strings of
846+
symbols with a symbol and count, and bit-packing where reduced
847+
alphabets can combine multiple symbols into a byte prior to entropy
848+
encoding.
849+
850+
The initial ``Order'' byte is expanded with additional bits to list
851+
the transformations to be applied. The specifics of each sub-format
852+
are listed below, in the order they are applied.
853+
854+
\begin{itemize}
855+
\item{\textbf{\textsc{Stripe}}:}
856+
rANS Nx16 with multi-way interleaving (see Section~\ref{sec:ransstripe}).
857+
858+
\item{\textbf{\textsc{NoSize}}:}
859+
Do not store the size of the uncompressed data stream.
860+
This information is not required when the data stream is one of the sub-streams in the \textsc{Stripe} format.
861+
862+
\item{\textbf{\textsc{Cat}}:}
863+
If present, the order bit flag is ignored.
864+
865+
The uncompressed data stream is the same as the compressed stream.
866+
This is useful for very short data where the overheads of compressing are too high.
867+
868+
\item{\textbf{\textsc{N32}}:}
869+
Flag indicating whether to interleave 4 or 32 rANS states.
870+
871+
\item{\textbf{\textsc{Order}}:}
872+
Bit field defining order-0 (unset) or order-1 (set) entropy encoding, as described below by the \textsc{RansDecodeNx16\_0} and \textsc{RansDecodeNx16\_1} functions.
873+
874+
\item{\textbf{\textsc{RLE}}:}
875+
Bit field defining whether Run Length Encoding has been applied to the data. If set, the reverse transform will be applied using \textsc{DecodeRLE} after Order-0 or Order-1 uncompression (see Section~\ref{sec:ransRLE}).
876+
877+
\item{\textbf{\textsc{Pack}}:}
878+
Bit field indicating the data was packed prior to compression (see Section~\ref{sec:ranspack}). If set, unpack the bits after any RLE decoding has been applied (if required) using the \textsc{DecodePack} function.
879+
\end{itemize}
880+
881+
\subsection{Frequency tables}
882+
883+
Frequency tables in rANS Nx16 separate the list of symbols from their
884+
frequencies. The symbol list must be stored in ascending ASCII order,
885+
with their frequency values in the same ordering as their
886+
corresponding symbols. For the Order-1 frequency table this list of
887+
symbols is those used in any context, thus we only have one alphabet
888+
recorded for all contexts. This means in some contexts some
889+
(potentially many) symbols will have zero frequency. To reduce the
890+
Order-1 table size an additional zero run-length encoding step is
891+
used. Finally the Order-1 frequencies may optionally be compressed
892+
using the Order-0 rANS Nx16 codec.
803893

804894
Frequencies must always add up to a power of 2, but do not necessarily
805895
have to match the final power of two used in the Order-0 (4096) and
@@ -808,8 +898,6 @@ \section{rANS Nx16}
808898
This is required as the Order-1 frequencies may be scaled differently
809899
for each context.
810900

811-
\subsection{Frequency tables}
812-
813901
\begin{algorithmic}[1]
814902
\Statex (Reads a set of symbols $A$ used in our alphabet)
815903
\Function{ReadAlphabet}{}
@@ -837,9 +925,9 @@ \subsection{Frequency tables}
837925
\vskip 0.5cm
838926

839927
\begin{algorithmic}[1]
840-
\Statex (Reads a table of Order-0 symbol frequencies $F_i$
928+
\Statex (Reads a table of Order-0 symbol frequencies $F_i$)
841929
\Statex (and sets the cumulative frequency table $C_{i+1} = C_i+F_i$)
842-
\Procedure{ReadFrequenciesNx16\_0}{$F, C$}
930+
\Procedure{ReadFrequenciesNx16\_0}{$F,\ C$}
843931
\State $F \gets (0,\ ...)$ \Comment{Set to zero for all $i \in \{0, 1,
844932
..., 255\}$}
845933
\State $A \gets$ \Call{ReadAlphabet}{}
@@ -858,7 +946,7 @@ \subsection{Frequency tables}
858946

859947
\begin{algorithmic}[1]
860948
\Statex (Normalises a table of frequencies $F_i$ to sum to a specified power of 2.)
861-
\Procedure{NormaliseFrequenciesNx16\_0}{$F$, $bits$}
949+
\Procedure{NormaliseFrequenciesNx16\_0}{$F,\ bits$}
862950
\State $tot \gets 0$
863951
\For{$i\gets 0 \algorithmicto 255$}
864952
\State $tot \gets tot + F_i$
@@ -900,9 +988,9 @@ \subsection{Frequency tables}
900988
% unspecified inputs or specified ones?
901989

902990
\begin{algorithmic}[1]
903-
\Statex (Reads a table of Order-1 symbol frequencies $F_{i,j}$
991+
\Statex (Reads a table of Order-1 symbol frequencies $F_{i,j}$)
904992
\Statex (and sets the cumulative frequency table $C_{i,j+1} = C_{i,j}+F_{i,j}$)
905-
\Procedure{ReadFrequenciesNx16\_1}{$F, C, bits$}
993+
\Procedure{ReadFrequenciesNx16\_1}{$F,\ C,\ bits$}
906994
\State $comp \gets$ \Call{ReadUint8}{}
907995
\State $bits \gets comp \shiftr 4$
908996
\If{$(comp \bitand 1) \ne 0$}
@@ -948,16 +1036,17 @@ \subsection{rANS Nx16 Order-0}
9481036
interleave to different amounts.
9491037

9501038
\begin{algorithmic}[1]
951-
\Function{RansGetCumulativeFreqNx16}{$R, bits$}
1039+
\Function{RansGetCumulativeFreqNx16}{$R,\ bits$}
9521040
\State \Return $R\ \bitand ((1 \shiftl bits) -1)$
9531041
\EndFunction
1042+
\Statex
9541043
\Function{RansAdvanceStepNx16}{$R, c, f, bits$}
9551044
\State \Return $f \times (R \shiftr bits) + (R\ \bitand ((1 \shiftl bits) -1)) - c$
9561045
\EndFunction
9571046
\Statex
958-
\Function{RansRenormNx16}{$source$, $R$}
1047+
\Function{RansRenormNx16}{$R$}
9591048
\If{$R < (1 \shiftl 15)$}
960-
\State $R \gets (R \shiftl 16) +$\ \Call{ReadUint16}{$source$}
1049+
\State $R \gets (R \shiftl 16) +$\ \Call{ReadUint16}{}
9611050
\EndIf
9621051
\State \Return $R$
9631052
\EndFunction
@@ -993,7 +1082,7 @@ \subsection{rANS Nx16 Order-1}
9931082
more complex too.
9941083

9951084
\begin{algorithmic}[1]
996-
\Function{RansDecodeNx16\_1}{$len, N$}
1085+
\Function{RansDecodeNx16\_1}{$len,\ N$}
9971086
\State \Call{ReadFrequenciesNx16\_1}{$F$, $C$, $bits$}
9981087
\For{$j \gets 0 \algorithmicto N-1$}
9991088
\State $R_j \gets$ \Call{ReadUint32}{}
@@ -1028,11 +1117,11 @@ \subsection{rANS Nx16 Run Length Encoding}
10281117

10291118
For symbols that occur many times in succession, we can replace them
10301119
with a single symbol and a count. In this specification, run lengths
1031-
are always provided for certain symbol values (even if the run length
1120+
are always provided for certain symbols (even if the run length
10321121
is 1) and never for the other symbols (even if many are consecutive).
10331122

1034-
The data stream is split into two: meta-data holding run-lengths
1035-
and the run-removed data itself.
1123+
The data stream is split into two parts: the meta-data holding
1124+
run-lengths and the run-removed data itself.
10361125

10371126
\begin{table}[h]
10381127
\centering
@@ -1056,14 +1145,17 @@ \subsection{rANS Nx16 Run Length Encoding}
10561145
\Statex (Reads and optionally uncompresses the blob of run-lengths and the array $L$)
10571146
\Statex (indicating which symbols have associated run-lengths.)
10581147
\Function{DecodeRLEMeta}{$N$}
1148+
\State $L \gets (0,\ ...)$ \Comment{Set to zero for all $i \in \{0, 1,
1149+
..., 255\}$}
10591150
\State $rle\_meta\_len \gets $\Call{ReadUint7}{}
10601151
\State $len \gets $\Call{ReadUint7}{} \Comment{Length of uncompressed O0/O1 data, pre-expansion}
10611152
\If{$rle\_meta\_len \bitand 1$}
10621153
\State $rle\_meta \gets $\Call{ReadData}{$\lfloor{}rle\_meta\_len/2\rfloor{}$}
10631154
\Else
10641155
\State $comp\_meta\_len \gets $\Call{ReadUint7}{}
10651156
\State $rle\_meta \gets $\Call{ReadData}{$comp\_meta\_len$}
1066-
\State $rle\_meta \gets $\Call{RansDecodeNx16\_0}{$rle\_meta\_len/2$, $source = rle\_meta$, $N$}
1157+
\State $rle\_meta \gets
1158+
$\Call{RansDecodeNx16\_0}{$rle\_meta\_len/2$, $N$, $source = rle\_meta$}
10671159
\EndIf
10681160

10691161
\Statex
@@ -1085,7 +1177,7 @@ \subsection{rANS Nx16 Run Length Encoding}
10851177

10861178
\begin{algorithmic}[1]
10871179
\Statex (Expands data ($in$) using run-length metadata)
1088-
\Function{DecodeRLE}{$in$, $L$, $metadata$, $in\_len$}
1180+
\Function{DecodeRLE}{$in,\ L,\ metadata,\ in\_len$}
10891181
\State $j \gets 0$
10901182
\For{$i \gets 0 \algorithmicto in\_len - 1$}
10911183
\State $sym \gets$ \Call{ReadUint8}{$in$}
@@ -1125,7 +1217,7 @@ \subsection{rANS Nx16 Bit Packing}
11251217
\hline
11261218
1 & byte & $nsym$ & Number of distinct symbols\\
11271219
$nsym$ & byte[] & $P$ & Symbol map \\
1128-
-? & uint7 & $len$ & Length of packed data
1220+
? & uint7 & $len$ & Length of packed data
11291221
\end{tabular}
11301222
\end{table}
11311223

@@ -1157,7 +1249,7 @@ \subsection{rANS Nx16 Bit Packing}
11571249
data as described above.
11581250

11591251
\begin{algorithmic}[1]
1160-
\Function{DecodePack}{$data$, $P$, $nsym$, $len$}
1252+
\Function{DecodePack}{$data,\ P,\ nsym,\ len$}
11611253
\State $j \gets 0$ \Comment{Index into $data$; $i$ is index into output}
11621254
\If{$nsym \le 1$} \Comment{Constant value}
11631255
\For{$i \gets 0$ to $len-1$}
@@ -1206,33 +1298,33 @@ \subsection{rANS Nx16 Bit Packing}
12061298
\subsection{Striped rANS Nx16}
12071299
\label{sec:ransstripe}
12081300

1209-
If we have a series of 32-bit values, we can get better compression by
1301+
If we have a series of 32-bit values, we can often get better compression by
12101302
treating it as a series of 4 8-bit values representing the first to
12111303
last bytes in each 32-bit word, than we can by simply processing it as
12121304
a stream of 8-bit values.
12131305
Each $4^{th}$ byte is sent to its own stream producing 4 interleaved streams, so the $1^{st}$ stream will hold data from bytes 0, 4, 8, etc.\ while the $2^{nd}$ stream will hold data from bytes 1, 5, 9, etc.
12141306
Each of those four streams is then itself compressed using this compression format.
12151307

1216-
For example an input block of small unsigned 32-bit little-endian numbers may use RLE for the first three streams as they are mostly zero, and a non-RLE Order-0 entropy encoder of the last stream.
1308+
For example an input block of small unsigned 32-bit little-endian numbers may use RLE for the first three streams as they are mostly zero, and a non-RLE Order-0 entropy encoder for the last stream.
12171309

1218-
In the general case we describe this as $X$-way interleaved streams.
1310+
In the general case we describe this as $N$-way interleaved streams.
12191311
We can consider this interleaving process to be equivalent to a table
1220-
transpose of $Y$ rows by $X$ columns to $X$ rows by $Y$ columns,
1221-
followed by compressing each $X$ row independently.
1312+
transpose of $M$ rows by $N$ columns to $N$ rows by $M$ columns,
1313+
followed by compressing each of the $N$ rows independently.
12221314

12231315
The byte stream consists of a 7-bit encoded uncompressed combined
1224-
length, a byte holding the value of $X$, followed by $X$ compressed
1316+
length, a byte holding the value of $N$, followed by $N$ compressed
12251317
lengths also 7-bit encoded. Finally the data sub-streams themselves,
12261318
each a valid $cdata$ stream, follow.
12271319

12281320
Normally our $cdata$ format will include the decoded size, but with
12291321
\textsc{Stripe} we can omit this from the internal compressed sub-streams
1230-
as given the total length we know how to compute the sub-lengths.
1322+
(using the \textsc{NoSize} flag) as given the total length we know how to compute the sub-lengths.
12311323

1232-
Reproducing the original uncompressed data involves decoding the $X$
1324+
Reproducing the original uncompressed data involves decoding the $N$
12331325
sub-streams and interleaving them together again (reversing the table
12341326
transpose). The uncompressed data length may not necessarily be an exact
1235-
multiple of $X$, in which case the latter uncompressed sub-streams may
1327+
multiple of $N$, in which case the latter uncompressed sub-streams may
12361328
be 1 byte shorter.
12371329

12381330
As an example starting with input data $D$ we define the transposed data $T$ as:
@@ -1243,18 +1335,18 @@ \subsection{Striped rANS Nx16}
12431335
\hspace{1cm}
12441336
$T = [\ abcde,\ ABCD,\ \underline{A}\underline{B}\underline{C}\underline{D}\ ]$
12451337

1246-
Note our example data is not a multiple of $X$ long, missing
1338+
Note our example data is not a multiple of $N$ long, missing
12471339
$E\underline{E}$, which gives $T$ fragments of length [5, 4, 4].
12481340

12491341
If $D_i$ is the $i^{th}$ character in $D$ and $T_{j,i}$ is the
12501342
$i^{th}$ character of the $j^{th}$ substring in $T$, transformations
12511343
between $D$ and $T$ are defined as:
12521344

12531345
\hspace{1cm}
1254-
$T_{j,i} = D_{i X +j}$
1346+
$T_{j,i} = D_{i N +j}$
12551347

12561348
\hspace{1cm}
1257-
$D_i = T_{(i \bmod X),\ (i \bdiv X)}$
1349+
$D_i = T_{(i \bmod N),\ (i \bdiv N)}$
12581350

12591351

12601352
% Example:
@@ -1329,24 +1421,24 @@ \subsection{Striped rANS Nx16}
13291421
\vskip 0.5cm
13301422

13311423
\begin{algorithmic}[1]
1332-
\Function{RansDecodeStripe}{$len, N$}
1333-
\State $X \gets $\Call{ReadUint8}{}
1334-
\For{$j \gets 0$ to $X$} \Comment{Fetch X compressed lengths}
1424+
\Function{RansDecodeStripe}{$len$}
1425+
\State $N \gets $\Call{ReadUint8}{}
1426+
\For{$j \gets 0$ to $N - 1$} \Comment{Fetch N compressed lengths}
13351427
\State $clen_j \gets $\Call{ReadUint7}{}
13361428
\EndFor
13371429
\Statex
1338-
\For{$j \gets 0$ to $X$} \Comment{Decode X streams}
1339-
\State $ulen_j \gets (len \bdiv X) + ((len \bmod X) > j)$
1430+
\For{$j \gets 0$ to $N - 1$} \Comment{Decode N streams}
1431+
\State $ulen_j \gets (len \bdiv N) + ((len \bmod N) > j)$
13401432
\Comment{$(x > y)$ expression being 1 if true, 0 if false}
13411433
\State $T_j \gets $\Call{RansDecodeNx16}{$ulen_j$}
13421434
\EndFor
13431435
\Statex
13441436
% \For{$i \gets 0$ to $len - 1$} \Comment{Interleave}
1345-
% \State $out_i \gets T_{(i \bmod X),\ (i \bdiv X)}$
1437+
% \State $out_i \gets T_{(i \bmod N),\ (i \bdiv N)}$
13461438
% \EndFor
1347-
\For{$j \gets 0$ to $X - 1$} \Comment{Stripe}
1439+
\For{$j \gets 0$ to $N - 1$} \Comment{Stripe}
13481440
\For{$i \gets 0$ to $ulen_j - 1$}
1349-
\State $out_{i \times X + j} \gets T_{j,i}$
1441+
\State $out_{i \times N + j} \gets T_{j,i}$
13501442
\EndFor
13511443
\EndFor
13521444
\State \Return $out$
@@ -1387,9 +1479,9 @@ \subsection{Combined rANS Nx16 Format}
13871479
\multicolumn{6}{|l|}{}\\[-0.3em]
13881480
\multicolumn{6}{|l|}{\textit{If \textsc{Stripe} flag is set:} } \\
13891481
\cline{2-5}
1390-
& ? & uint8 & X & Number of sub-streams & \\
1391-
& ? & uint7[] & clen[] & X copies of compressed sub-block length & \\
1392-
& ? & uint8[] & cdata[] & X copies of Compressed data sub-block (recurse) & \\
1482+
& 8 & uint8 & N & Number of sub-streams & \\
1483+
& ? & uint7[] & clen[] & N copies of compressed sub-block length & \\
1484+
& ? & uint8[] & cdata[] & N copies of Compressed data sub-block (recurse) & \\
13931485
\cline{2-5}
13941486

13951487
\multicolumn{6}{|l|}{}\\[-0.7em]
@@ -1407,7 +1499,7 @@ \subsection{Combined rANS Nx16 Format}
14071499
\multicolumn{6}{|l|}{}\\[-0.7em]
14081500
\multicolumn{6}{|l|}{\textit{If \textsc{RLE} flag is set (and neither \textsc{Stripe} nor \textsc{Cat} flags are set):} } \\
14091501
\cline{2-5}
1410-
& ? & uint8[] & rle\_meta & RLE meta-data.\\
1502+
& ? & uint8[] & rle\_meta & RLE meta-data\\
14111503
\cline{2-5}
14121504

14131505
\multicolumn{6}{|l|}{}\\[-0.7em]
@@ -1428,19 +1520,18 @@ \subsection{Combined rANS Nx16 Format}
14281520
\hline
14291521
\textbf{Bit AND value} & \textbf{Code} & \textbf{Description} \\
14301522
\hline
1431-
1 & \textsc{Order} & Order-0 or Order-1 entropy coding. \\
1523+
1 & \textsc{Order} & Order-0 or Order-1 entropy coding\\
14321524
2 & reserved & Reserved (for possible order-2/3)\\
14331525
4 & \textsc{N32} & Interleave $N=32$ rANS states (else $N=4$)\\
1434-
8 & \textsc{Stripe}\tnote{\textbf{a}} & multi-way interleaving of byte streams.\\
1526+
8 & \textsc{Stripe}\tnote{\textbf{$*$}} & multi-way interleaving of byte streams\\
14351527
16 & \textsc{NoSize} & original size is not recorded (for use by \textsc{Stripe})\\
14361528
32 & \textsc{Cat} & Data is uncompressed\\
14371529
64 & \textsc{RLE} & Run length encoding, with runs and literals encoded separately\\
1438-
128 & \textsc{Pack} & Pack 2, 4, 8 or infinite symbols per byte.\\
1530+
128 & \textsc{Pack} & Pack 2, 4, 8 or infinite symbols per byte\\
14391531
\hline
14401532
\end{tabular}
14411533
\begin{tablenotes}
1442-
\item{\textbf{a.}} Not to be used in conjunction with other bit-field
1443-
values except \textsc{NoSize}.
1534+
\item{\footnotesize{($*$)}} \footnotesize{Not to be used in conjunction with other bit-field values except \textsc{NoSize}.}
14441535
\end{tablenotes}
14451536
\end{threeparttable}
14461537

@@ -1499,41 +1590,11 @@ \subsection{Combined rANS Nx16 Format}
14991590
\EndFunction
15001591
\end{algorithmic}
15011592

1502-
The specifics of each sub-format are described below, in the order (minus meta-data specific shuffling) they are applied.
1503-
1504-
\begin{itemize}
1505-
\item{\textbf{\textsc{Stripe}}:}
1506-
rANS Nx16 with multi-way interleaving (see Section~\ref{sec:ransstripe}).
1507-
1508-
\item{\textbf{\textsc{NoSize}}:}
1509-
Do not store the size of the uncompressed data stream.
1510-
This information is not required when the data stream is one of the four sub-streams in the \textsc{Stripe} format.
1511-
1512-
\item{\textbf{\textsc{Cat}}:}
1513-
If present, the order bit flag is ignored.
1514-
1515-
The uncompressed data stream is the same as the compressed stream.
1516-
This is useful for very short data where the overheads of compressing are too high.
1517-
1518-
\item{\textbf{\textsc{N32}}:}
1519-
Flag indicating whether to interleave 4 or 32 rANS states.
1520-
1521-
\item{\textbf{\textsc{Order}}:}
1522-
Bit field defining order-0 (unset) or order-1 (set) entropy encoding, as described above by the \textsc{RansDecodeNx16\_0} and \textsc{RansDecodeNx16\_1} functions.
1523-
1524-
\item{\textbf{\textsc{RLE}}:}
1525-
Bit field defining whether Run Length Encoding has been applied to the data. If set, the reverse transorm will be applied using \textsc{DecodeRLE} after Order-0 or Order-1 uncompression (see Section~\ref{sec:ransRLE}).
1526-
1527-
\item{\textbf{\textsc{Pack}}:}
1528-
Bit field indicating the data was packed prior to compression (see Section~\ref{sec:ranspack}). If set, unpack the bits after any RLE decoding has been applied (if required) using the \textsc{DecodePack} function.
1529-
1530-
\end{itemize}
1531-
15321593
\section{Range coding}
15331594

15341595
The range coder is a byte-wise arithmetic coder that operates by
15351596
repeatedly reducing a probability range (for example 0.0 to 1.0) one
1536-
symbol (byte) at a time with the complete compressed data can be
1597+
symbol (byte) at a time, with the complete compressed data being
15371598
represented by any value within the final range.
15381599

15391600
This is most easily demonstrated with a worked example, so let us imagine
@@ -1563,27 +1624,25 @@ \section{Range coding}
15631624
table footnotes below for the worked mathematics.
15641625

15651626
\begin{threeparttable}[t]
1566-
\begin{tabular}{rrrrr}
1627+
\begin{tabular}{rrrrrrr}
15671628
\hline
1568-
\textbf{Range low} & \textbf{Range high} & \textbf{Symbol} & \textbf{Symbol low} & \textbf{Symbol high}\\
1629+
\textbf{Range low} & \textbf{/ high} & \textbf{Symbol} & \textbf{Sym. low} & \textbf{/ high} & \textbf{New range low} & \textbf{New range high}\\
15691630
\hline
1570-
0.000 & 1.000 & c & 0.2 & 0.5\\
1571-
0.200 & 0.500 & a & 0.8 & 1.0\\
1572-
0.440\tnote{\textbf{a}} & 0.500\tnote{\textbf{a}} & t & 0.0 & 0.2\\
1631+
0.000 & 1.000 & c & 0.2 & 0.5 & $0+(1-0)\times.2$ & $0+(1-0)\times.5$\\
1632+
0.200 & 0.500 & a & 0.8 & 1.0 & $.2+(.5-.2)\times.8$& $.2+(.5-.2)\times 1$\\
1633+
0.440 & 0.500 & t & 0.0 & 0.2 & $.44+(.5-.44)\times 0$ & $.44+(.5-.44)\times .2$\\
15731634
0.440 & 0.452 & <end>\\
15741635
\hline
15751636
\end{tabular}
1576-
\begin{tablenotes}
1577-
\item{\textbf{a.}} Old range 0.2 to 0.5 plus symbol range 0.8 to 1.0 gives an updated range of 0.44 to 0.5:\\
1578-
$0.2 + 0.8\times(0.5-0.2) = 0.44$\\
1579-
$0.2 + 1.0\times(0.5-0.2) = 0.50$
1580-
\end{tablenotes}
15811637
\end{threeparttable}
15821638

15831639
Our final range is 0.44 to 0.452 with any value in that range representing
15841640
``cat'', thus 0.45 would suffice. A pictorial example of this process is below.
15851641

1642+
\begin{figure}[h]
15861643
\includegraphics[height=250pt, keepaspectratio=true]{img/range_code.png}
1644+
\caption{A pictorial demonstration of range reduction.}
1645+
\end{figure}
15871646

15881647
Decoding is simply the reverse of this. In the above picture we can see that 0.45 would read off `c', `a' and `t' by repeatedly comparing the symbol ranges to the current range and using those to identify the symbol and produce a new range.
15891648

@@ -1640,7 +1699,7 @@ \section{Range coding}
16401699
\end{algorithmic}
16411700

16421701
\begin{algorithmic}[1]
1643-
\Procedure{RangeDecode}{$sym\_low, sym\_freq, tot\_freq$}
1702+
\Procedure{RangeDecode}{$sym\_low,\ sym\_freq,\ tot\_freq$}
16441703
\settowidth{\maxwidth}{range\ }
16451704
\State \algalign{code}{\gets} $code - sym\_low \times range$
16461705
\State \algalign{range}{\gets} $range \times sym\_freq$
@@ -1660,7 +1719,7 @@ \section{Range coding}
16601719
The \textsc{RangeEncode} function is a straightforward reversal of \textsc{RangeDecode}, with the exception of the special code for shifting the top byte out of the $low$ variable.
16611720

16621721
\begin{algorithmic}[1]
1663-
\Procedure{RangeEncode}{$sym\_low, sym\_freq, tot\_freq$}
1722+
\Procedure{RangeEncode}{$sym\_low,\ sym\_freq,\ tot\_freq$}
16641723
\settowidth{\maxwidth}{old\_low\ }
16651724
\State \algalign{old\_low}{\gets} $low$
16661725
\State \algalign{range}{\gets} $range \bdiv tot\_freq$
@@ -1730,7 +1789,7 @@ \section{Range coding}
17301789
\end{algorithmic}
17311790

17321791

1733-
\subsection{Statistical Modelling}
1792+
\subsection{Adaptive Modelling}
17341793

17351794
The probabilities passed to the range coder may be fixed for all scenarios (as we had in the ``cat'' example), or they may be adaptive and context aware.
17361795
For example the letter `u' occurs around 3\% of the time in English text, but if the previous letter was `q' it is close to 100\% and if the previous letter was `u' it is close to 0\%.
@@ -1861,10 +1920,9 @@ \subsection{RLE with Order-0 and Order-1 Encoding}
18611920
(if $\ge 4$) and 257 for any further continuation runs. Thus encoding
18621921
10 `A' characters would first store symbol `A' followed by run length
18631922
3 (with context `A'), length 3 (context 256), length 3 (context
1864-
257), and length 1 (context 258).
1923+
257), and length 0 (context 257).
18651924

1866-
For example, if we have the string ``RRRRUNN'' we will decode
1867-
symbol `R' run 3, symbol `U' run 0, symbol `N' run 1.
1925+
For example, if we have the string ``ABBCCCCDDDDD'' we will record ``A''<0> ``B''<1> ``C''<3,0> and ``D''<3,1>.
18681926

18691927
\begin{algorithmic}[1]
18701928
\Function{DecodeRLE0}{$len$}
@@ -1962,7 +2020,7 @@ \subsection{RLE with Order-0 and Order-1 Encoding}
19622020
\multicolumn{6}{|l|}{}\\[-0.3em]
19632021
\multicolumn{6}{|l|}{\textit{If \textsc{Stripe} flag is set:} } \\
19642022
\cline{2-5}
1965-
& ? & uint8 & X & Number of sub-streams & \\
2023+
& 8 & uint8 & N & Number of sub-streams & \\
19662024
& ? & uint7[] & clen[] & N copies of compressed sub-block length & \\
19672025
& ? & uint8[] & cdata[] & N copies of Compressed data sub-block (recurse) & \\
19682026
\cline{2-5}
@@ -1997,20 +2055,19 @@ \subsection{RLE with Order-0 and Order-1 Encoding}
19972055
\hline
19982056
\textbf{Bit AND value} & \textbf{Code} & \textbf{Description} \\
19992057
\hline
2000-
1 & \textsc{Order}\tnote{\textbf{a}} & Order-0 or Order-1 entropy coding. \\
2058+
1 & \textsc{Order}\tnote{\textbf{$*$}} & Order-0 or Order-1 entropy coding\\
20012059
2 & reserved & Reserved (for possible order-2/3)\\
20022060
4 & \textsc{Ext} & ``External'' compression via bzip2\\
2003-
8 & \textsc{Stripe}\tnote{\textbf{b}} & N-way interleaving of byte streams.\\
2004-
16 & \textsc{NoSize} & original size is not recorded (used by \textsc{Stripe})\\
2005-
32 & \textsc{Cat}\tnote{\textbf{b}} & Data is uncompressed\\
2006-
64 & \textsc{RLE}\tnote{\textbf{a}} & Run length encoding, with runs and literals encoded separately\\
2007-
128 & \textsc{Pack} & Pack 2, 4, 8 or infinite symbols per byte.\\
2061+
8 & \textsc{Stripe}\tnote{\textbf{\dag}} & N-way interleaving of byte streams\\
2062+
16 & \textsc{NoSize} & Original size is not recorded (used by \textsc{Stripe})\\
2063+
32 & \textsc{Cat}\tnote{\textbf{\dag}} & Data is uncompressed\\
2064+
64 & \textsc{RLE}\tnote{\textbf{$*$}} & Run length encoding, with runs and literals encoded separately\\
2065+
128 & \textsc{Pack} & Pack 2, 4, 8 or infinite symbols per byte\\
20082066
\hline
20092067
\end{tabular}
20102068
\begin{tablenotes}
2011-
\item{\textbf{a.}} Has no effect when \textsc{Ext} flag is set.
2012-
\item{\textbf{b.}} Not to be used in conjunction with other flags
2013-
except \textsc{Pack} and \textsc{NoSize}.
2069+
\item{\footnotesize{($*$)}} \footnotesize{Has no effect when \textsc{Ext} flag is set.}
2070+
\item{\footnotesize{(\dag)}} \footnotesize{Not to be used in conjunction with other flags except \textsc{Pack} and \textsc{NoSize}.}
20142071
\end{tablenotes}
20152072
\end{threeparttable}
20162073

@@ -2031,37 +2088,37 @@ \subsection{RLE with Order-0 and Order-1 Encoding}
20312088
\begin{algorithmic}[1]
20322089
\Function{ArithDecode}{$len$}
20332090
\State $flags \gets $\Call{ReadUint8}{}
2034-
\If{$flags \bitand$ \textsc{NoSize} $\ne 0$}
2091+
\If{$flags\ \bitand$ \textsc{NoSize} $\ne 0$}
20352092
\State $len \gets$\Call{ReadUint7}{}
20362093
\EndIf
2037-
\If{$flags \bitand$ \textsc{Stripe}}
2038-
\State $data \gets $\Call{DecodeSTRIPE}{$len$}
2094+
\If{$flags\ \bitand$ \textsc{Stripe}}
2095+
\State $data \gets $\Call{DecodeStripe}{$len$}
20392096
\State \Return $data$
20402097
\EndIf{}
2041-
\If{$flags \bitand$ \textsc{Pack}}
2098+
\If{$flags\ \bitand$ \textsc{Pack}}
20422099
\State $pack\_len \gets len$
20432100
\State $(P,\ nsym,\ len) \gets $\Call{DecodePackMeta}{}
20442101
\EndIf
20452102
\Statex \Comment{Entropy Decoding}
2046-
\If{$flags \bitand$ \textsc{Cat}}
2103+
\If{$flags\ \bitand$ \textsc{Cat}}
20472104
\State $data \gets $\Call{ReadData}{$len$}
2048-
\ElsIf{$flags \bitand$ \textsc{Ext}}
2105+
\ElsIf{$flags\ \bitand$ \textsc{Ext}}
20492106
\State $data \gets $\Call{DecodeEXT}{$len$}
2050-
\ElsIf{$flags \bitand$ \textsc{RLE}}
2051-
\If{$flags \bitand$ \textsc{Order}}
2107+
\ElsIf{$flags\ \bitand$ \textsc{RLE}}
2108+
\If{$flags\ \bitand$ \textsc{Order}}
20522109
\State $data \gets $\Call{DecodeRLE1}{$len$}
20532110
\Else
20542111
\State $data \gets $\Call{DecodeRLE0}{$len$}
20552112
\EndIf
20562113
\Else
2057-
\If{$flags \bitand$ \textsc{Order}}
2114+
\If{$flags\ \bitand$ \textsc{Order}}
20582115
\State $data \gets $\Call{DecodeOrder1}{$len$}
20592116
\Else
20602117
\State $data \gets $\Call{DecodeOrder0}{$len$}
20612118
\EndIf
20622119
\EndIf
20632120
\Statex \Comment{Apply data transformations}
2064-
\If{$flags \bitand$ \textsc{Pack}}
2121+
\If{$flags\ \bitand$ \textsc{Pack}}
20652122
\State $data \gets $\Call{DecodePack}{$data$, $P$, $nsym$, $pack\_len$}
20662123
\EndIf
20672124
\State \Return $data$
@@ -2086,7 +2143,7 @@ \subsection{RLE with Order-0 and Order-1 Encoding}
20862143
their own algorithm.
20872144

20882145
\begin{algorithmic}[1]
2089-
\Function{RansDecodeStripe}{$len$}
2146+
\Function{DecodeStripe}{$len$}
20902147
\State $N \gets $\Call{ReadUint8}{}
20912148
\For{$j \gets 0$ to $N - 1$} \Comment{Fetch N compressed lengths}
20922149
\State $clen_j \gets $\Call{ReadUint7}{}
@@ -2101,7 +2158,7 @@ \subsection{RLE with Order-0 and Order-1 Encoding}
21012158
% \For{$i \gets 0$ to $len - 1$} \Comment{Interleave}
21022159
% \State $out_i \gets T_{(i \bmod N),\ (i \bdiv N)}$
21032160
% \EndFor
2104-
\For{$j \gets 0$ to $N - 1$} \Comment{Interleave}
2161+
\For{$j \gets 0$ to $N - 1$} \Comment{Stripe}
21052162
\For{$i \gets 0$ to $ulen_j - 1$}
21062163
\State $out_{i \times N + j} \gets T_{j,i}$
21072164
\EndFor
@@ -2161,7 +2218,7 @@ \subsection{RLE with Order-0 and Order-1 Encoding}
21612218
\section{Name tokenisation codec}
21622219

21632220
Sequence names (identifiers) typically follow a structured pattern and compression based on columns within those structures usually leads to smaller sizes.
2164-
The sequence name (identifier) tokenisation relies heavily on the General Purpose Entropy Encoder described above.
2221+
The sequence name (identifier) tokenisation relies heavily on the rANS Nx16 and Adaptive arithmetic coders described above.
21652222

21662223
As an example, take a series of names:
21672224

@@ -2191,17 +2248,17 @@ \section{Name tokenisation codec}
21912248
\hline
21922249
\textbf{ID} & \textbf{Type} & \textbf{Value} & \textbf{Description}\\
21932250
\hline
2194-
0 & TYPE & Type & Used to determine the type of token at a given position. \\
2251+
0 & TYPE & Type & Used to determine the type of token at a given position \\
21952252
\hline
2196-
5 & DUP & Integer (distance) & The entire name is a duplicate of an earlier one. Used in position 0 only.\\
2197-
6 & DIFF & Integer (distance) & The entire name is differs to earlier ones. Used in position 0 only.\\
2253+
5 & DUP & Integer (distance) & The entire name is a duplicate of an earlier one. Used in position 0 only\\
2254+
6 & DIFF & Integer (distance) & The entire name differs to earlier ones. Used in position 0 only\\
21982255
\hline
21992256
1 & STRING & String & A nul-terminated string of characters \\
22002257
2 & CHAR & Byte & A single character \\
22012258
7 & DIGITS & $0 \le$ Int $< 2^{32}$ & A numerical value, not containing a leading zero \\
22022259
3 & DIGITS0 & $0 \le$ Int $< 2^{32}$ & A numerical value possibly starting with leading zeros \\
2203-
4 & DZLEN & Int length & Length of associated DIGITS0 token.\\
2204-
8 & DELTA & $0 \le$ Int $< 256$ & A numeric value being stored as the difference to the numeric value of this token on the previous name. \\
2260+
4 & DZLEN & Int length & Length of associated DIGITS0 token\\
2261+
8 & DELTA & $0 \le$ Int $< 256$ & A numeric value being stored as the difference to the numeric value of this token on the previous name \\
22052262
9 & DELTA0 & $0 \le$ Int $< 256$ & As DELTA, but for numeric values starting with leading zeros \\
22062263
10 & MATCH & (none) & This token is identical in type and value to the same position in the previous name (NB: not permitted for DELTA/DELTA0)\\
22072264
11 & NOP & (none) & Does nothing\\
@@ -2238,7 +2295,7 @@ \section{Name tokenisation codec}
22382295
These types are fetched for position 0, at the start of each new
22392296
identifier. The value is an integer value describing how many reads
22402297
before this (with 1 being the immediately previous name) we are
2241-
comparing against. When we refer to ``previous name'' below, we
2298+
comparing against. When we subsequently refer to ``previous name'' below, we
22422299
always mean the one indicated by the DIFF field and not the one
22432300
immediately prior to the current name.
22442301

@@ -2427,18 +2484,18 @@ \section{Name tokenisation codec}
24272484
\cline{3-6}
24282485
& \multicolumn{5}{l|}{\textit{else if not duplicate}}\\
24292486
\cline{3-6}
2430-
& & ? & i7 & $clen$ & compressed length (7-bit encoding)\\
2431-
& & $clen$ & $cdata$ & stream & compressed data stream\\
2487+
& & ? & uint7 & $clen$ & compressed length\\
2488+
& & $clen$ & uint8[] & $cdata$ & compressed data stream\\
24322489
\hline
24332490
\end{tabular}
24342491

24352492
A few tricks are used to remove some byte streams. In addition to the explicit marking of duplicate byte streams, if a byte stream of token types is entirely MATCH apart from the very first value it is discarded. It is possible to regenerate this during decode by observing the other byte streams. For example if we have a byte stream $B_{5,DIGITS}$ but no $B_{5,TYPE}$ then we assume the contents of $B_{5,TYPE}$ consist of one DIGITS type followed by as many MATCH types as are needed.
24362493

2437-
The $cdata$ stream itself is as described in the General Purpose Entropy Encoder section above, with the \textsc{ArithDecode} function.
2494+
The $cdata$ stream itself is as described in the relevant entropy encoder section above (rANS or arithmetic coding).
24382495

24392496
\begin{algorithmic}[1]
24402497
\Statex
2441-
\Statex \textit{(Decodes and uncompressed the serialised token byte streams)}
2498+
\Statex \textit{(Decodes and uncompresses the serialised token byte streams)}
24422499
\Function{DecodeTokenByteStreams}{$use\_arith$}
24432500
\State $sz \gets 0$
24442501
\State $t \gets -1$
@@ -2462,9 +2519,9 @@ \section{Name tokenisation codec}
24622519
\State $clen \gets$ \Call{ReadUint7}{}
24632520
\State $data \gets$ \Call{ReadData}{$clen$}
24642521
\If{$use\_arith$}
2465-
\State $B_{t,type} \gets$ \Call{ArithDecode}{}
2522+
\State $B_{t,type} \gets$ \Call{ArithDecode}{$clen,\ source=data$}
24662523
\Else
2467-
\State $B_{t,type} \gets$ \Call{RansDecodeNx16}{}
2524+
\State $B_{t,type} \gets$ \Call{RansDecodeNx16}{$clen,\ source=data$}
24682525
\EndIf
24692526
\EndIf
24702527
\Until{\Call{EOF}{}}
@@ -2474,7 +2531,7 @@ \section{Name tokenisation codec}
24742531

24752532
\begin{algorithmic}[1]
24762533
\Statex
2477-
\Statex \textit{(Decodes the $n^{th}$ name, returning $N_n$ and updating $N_n$ and $T_n$)}
2534+
\Statex \textit{(Decodes all names, returning $N$)}
24782535
\Function{DecodeNames}{}
24792536
\State $ulen \gets$ \Call{ReadUint32}{}
24802537
\State $nnames \gets$ \Call{ReadUint32}{}
@@ -2527,12 +2584,13 @@ \section{FQZComp quality codec}
25272584
sequence is the second in a pair, and a running total of number of
25282585
times the quality has changed in this sequence).
25292586

2530-
For each quality value, the models produce probabilities for all
2531-
possible next quality values, which are passed into an arithmetic
2532-
entropy encoder to encode or decode the actual next quality value.
2533-
The models are then updated based on the actual next quality in order
2534-
to learn the statistical properties of the quality data stream. This
2535-
step wise update process is identical for both encoding and decoding.
2587+
For each position along the sequence, the models produce probabilities
2588+
for all possible next quality values, which are passed into an
2589+
arithmetic entropy encoder to encode or decode the actual next quality
2590+
value. The models are then updated based on the actual next quality
2591+
in order to learn the statistical properties of the quality data
2592+
stream. This stepwise update process is identical for both encoding
2593+
and decoding.
25362594

25372595
The algorithm is a generalisation of the original fqzcomp program,
25382596
described in \textit{Compression of FASTQ and SAM Format Sequencing
@@ -2552,7 +2610,7 @@ \subsection{FQZComp Models}
25522610

25532611
The parameter selector model also has no context associated with it
25542612
and encodes $max\_sel$ distinct values. The selector value may be
2555-
quantised further using $stab$ to reduce the selector to fewer
2613+
quantised further using $stab$ (Selector Table) to reduce the selector to fewer
25562614
sets of parameters. This is useful if we wish to use the selector
25572615
bits directly in the context using the same parameters. The selector
25582616
is arbitrary and may be used for distinguishing READ1 from READ2, as
@@ -2569,7 +2627,31 @@ \subsection{FQZComp Models}
25692627
There are 4 read length models each having $max\_sym$ of 256. Each
25702628
model is used for one of the 4 successive bytes in a 32-bit length value.
25712629

2572-
\begin{center}
2630+
The entropy encoder used is shared between all models, so the bit
2631+
streams are multiplexed together.
2632+
2633+
The 16-bit quality value context is constructed by adding sub-contexts
2634+
together consisting of previous quality values, position along the
2635+
current record, a running count (per record) of how many times the
2636+
quality value has differed to the previous one (delta), and an
2637+
arbitrary stored selector value, each shifted to a defined location
2638+
within the combined context value ($qloc$, $ploc$, $dloc$ and
2639+
$sloc$ respectively). The qual, pos and delta sub-contexts are
2640+
computed from the previous data while the selector, if used, is read
2641+
directly from the compressed data stream. The selector may be used to
2642+
switch parameter sets, or simply to group quality strings into
2643+
arbitrary user-defined sub-sets. The numeric values for each of these
2644+
components can be passed through lookup tables ($qtab$ for quality,
2645+
$ptab$ for positions, $dtab$ for running delta and $stab$ for turning
2646+
the selector $s$ into a parameter index $x$). These all convert the
2647+
monotonically increasing range 0$\rightarrow$M to a (usually smaller)
2648+
monotonically increasing 0$\rightarrow$N. For example if we wish to
2649+
use the approximate position along a 100 byte string, we may uniformly
2650+
map 0$\rightarrow$127 to 0$\rightarrow$15 to utilise 4 bits of our
2651+
16-bit combined context.
2652+
2653+
\begin{figure}[h]
2654+
\centering
25732655
\begin{tikzpicture}[
25742656
boxed/.style={rectangle, draw=black, text width=1cm},
25752657
boxed1/.style={rectangle, draw=black},
@@ -2646,42 +2728,20 @@ \subsection{FQZComp Models}
26462728
\node[below right,boxed1] (d3) at (d2.south west) {\footnotesize{3}};
26472729

26482730
\end{tikzpicture}
2649-
\end{center}
2650-
2651-
The entropy encoder used is shared between all models, so the bit
2652-
streams are multiplexed together.
2653-
2654-
The 16-bit quality value context is constructed by adding sub-contexts
2655-
together consisting of previous quality values, position along the
2656-
current record, a running count (per record) of how many times the
2657-
quality value has differed to the previous one (delta), and an
2658-
arbitrary stored selector value, each shifted to a defined location
2659-
within the combined context value ($qloc$, $ploc$, $dloc$ and
2660-
$sloc$ respectively). The qual, pos and delta sub-contexts are
2661-
computed from the previous data while the selector, if used, is read
2662-
directly from the compressed data stream. The selector may be used to
2663-
switch parameter sets, or simply to group quality strings into
2664-
arbitrary user-defined sub-sets. The numeric values for each of these
2665-
components can be passed through lookup tables ($qtab$ for quality,
2666-
$ptab$ for positions, $dtab$ for running delta and $stab$ for turning
2667-
the selector $s$ into a parameter index $x$). These all convert the
2668-
monotonically increasing range 0$\rightarrow$M to a (usually smaller)
2669-
monotonically increasing 0$\rightarrow$N. For example if we wish to
2670-
use the approximate position along a 100 byte string, we may uniformly
2671-
map 0$\rightarrow$127 to 0$\rightarrow$15 to utilise 4 bits of our
2672-
16-bit combined context.
2731+
\caption{An example FQZComp configuration.}
2732+
\end{figure}
26732733

26742734
As some sequencing instruments produce binned qualities, e.g. 0, 10, 25,
26752735
35, these values are squashed to incremental values from 0 to
26762736
$max\_sym-1$ where $max\_sym$ is the maximum number of distinct
26772737
quality values observed. If this transform is required, the flag
26782738
$have\_qmap$ will be set and a mapping table ($qmap$) will hold the
2679-
original quality values. The decoded qualities will be the smaller
2739+
original quality values. The encoded qualities will be the smaller
26802740
mapped range.
26812741

26822742
The quality sub-context is constructed by shifting left the previous
26832743
quality sub-context by $qshift$ bits and adding the current quality
2684-
after passing through the $qmap$ squashing process and if defined
2744+
after passing through the $qmap$ transform and, if defined,
26852745
through the $qtab$ lookup table. The quality context is limited to
26862746
$qbits$ long and is added to the combined context starting at bit
26872747
$qloc$. The quality sub-context is reset to zero at the start of each
@@ -2718,7 +2778,9 @@ \subsection{FQZComp Models}
27182778
$pos$, $delta$, $prevq$, $qctx$ and $sel$ parameters referred are global and updateable.
27192779

27202780
\begin{algorithmic}[1]
2721-
\Function{FQZUpdateContext}{$params, q$}
2781+
\Statex
2782+
\Statex (Add quality $q$ to produce and return a new context $ctx$)
2783+
\Function{FQZUpdateContext}{$params,\ q$}
27222784
\State $ctx \gets params.context$ \Comment{Also the initial value}
27232785
\State $qctx \gets (qctx \shiftl params.qshift) + qtab_q$
27242786
\State $ctx \gets ctx + ((qctx \bitand (2^{params.qbits}-1)) \shiftl params.qloc)$
@@ -2762,7 +2824,7 @@ \subsection{FQZComp Data Stream}
27622824
The start of an FQZComp data stream consists of the parameters used by
27632825
the decoder. The data layout is as follows.
27642826

2765-
\begin{table}
2827+
\begin{table}[H]
27662828
\centering
27672829
\begin{tabular}{|r|r|r|r|r|p{8cm}|l|l|}
27682830
\hline
@@ -2773,7 +2835,7 @@ \subsection{FQZComp Data Stream}
27732835
\multicolumn{3}{|r|}{8} & uint8 & $gflags$ & \multicolumn{3}{p{8.8cm}|}{Global FQZcomp bit-flags. From lowest bit to highest:}\\
27742836
\multicolumn{3}{|r|}{} & & & \multicolumn{3}{p{8.8cm}|}{1: $multi\_param$: indicates more than one parameter block is present. Otherwise set $nparam = 1$} \\
27752837
\multicolumn{3}{|r|}{} & & & \multicolumn{3}{p{8.8cm}|}{2: $have\_stab$: indicates the parameter selector is mapped through $stab$. Otherwise set $stab_i = i$} \\
2776-
\multicolumn{3}{|r|}{} & & & \multicolumn{3}{p{8.8cm}|}{4: $do\_rev$: $model\_revcomp$ will be used. (CRAM v3.1)} \\
2838+
\multicolumn{3}{|r|}{} & & & \multicolumn{3}{p{8.8cm}|}{4: $do\_rev$: $model\_revcomp$ will be used (CRAM v3.1)} \\
27772839
\hline
27782840

27792841
\multicolumn{8}{|l|}{}\\[-0.7em]
@@ -2799,8 +2861,8 @@ \subsection{FQZComp Data Stream}
27992861
& \multicolumn{2}{r|}{8} & uint8 & $pflags$ & \multicolumn{2}{p{8.4cm}|}{Per-parameter block bit-flags. From lowest bit to highest:} & \\
28002862
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{1: Reserved} & \\
28012863
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{2: $do\_dedup$: model\_dup will be used} & \\
2802-
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{4: $do\_len$: model\_len will be used for every record.} & \\
2803-
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{8: $do\_sel$: model\_sel will be used.} & \\
2864+
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{4: $do\_len$: model\_len will be used for every record} & \\
2865+
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{8: $do\_sel$: model\_sel will be used} & \\
28042866
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{16: $have\_qmap$: indicates quality map is present} & \\
28052867
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{32: $have\_ptab$: Load $ptab$, otherwise position contexts are unused} & \\
28062868
& \multicolumn{2}{r|}{} & & & \multicolumn{2}{p{8.4cm}|}{64: $have\_dtab$: Load $dtab$, otherwise delta contexts are unused} & \\
@@ -2878,11 +2940,12 @@ \subsection{FQZComp Data Stream}
28782940
% FIXME: our worked example should include actual bytes for qmap, ptab
28792941
% and dtab too.
28802942

2881-
2943+
\pagebreak
28822944
\textsc{FQZDecodeParams} below describes the pseudocode for reading
28832945
the parameter block.
28842946

28852947
\begin{algorithmic}[1]
2948+
\Statex
28862949
\Procedure{FQZDecodeParams}{}
28872950
\State $vers \gets $\Call{ReadUint8}{}
28882951
\If{$vers \ne 5$}
@@ -2911,6 +2974,7 @@ \subsection{FQZComp Data Stream}
29112974
\end{algorithmic}
29122975

29132976
\begin{algorithmic}[1]
2977+
\Statex
29142978
\Function{FQZDecodeSingleParam}{}
29152979
\settowidth{\maxwidth}{p.have\_qtab\ }
29162980
\State \algalign{p.context}{\gets} \Call{ReadUint16}{}
@@ -3010,6 +3074,7 @@ \subsection{FQZComp Data Stream}
30103074
then $A$. The following pseudocode demonstrates this process.
30113075

30123076
\begin{algorithmic}[1]
3077+
\Statex
30133078
\Function{ReadArray}{$n$}
30143079
\State $i,j,z \gets 0$
30153080
\State $last \gets -1$
@@ -3048,7 +3113,7 @@ \subsection{FQZComp Data Stream}
30483113
\EndFunction
30493114
\end{algorithmic}
30503115

3051-
The main loop decodes data in the following order per read: read
3116+
The FQZComp main loop decodes data in the following order per read: read
30523117
length (if not fixed), the flag for whether this is read 2 (if
30533118
needed), a bit flag to indicate if the quality is duplicated (if
30543119
needed), followed by record length number of quality values using
@@ -3068,6 +3133,7 @@ \subsection{FQZComp Data Stream}
30683133
\algnewcommand{\Label}{\State\unskip}
30693134

30703135
\begin{algorithmic}[1]
3136+
\Statex
30713137
\Function{FQZNewRecord}{}
30723138
\State $sel \gets 0$
30733139
\State $x \gets 0$
@@ -3083,9 +3149,7 @@ \subsection{FQZComp Data Stream}
30833149
\If{$param.do\_len \logor param.first\_len$} \Comment{Decode read length}
30843150
\State $rec\_len \gets $\Call{DecodeLength}{$rc$}
30853151
\State $param.last\_len \gets rec\_len$
3086-
\If{$param.do\_len = 0$}
3087-
\State $param.first\_len = 0$
3088-
\EndIf
3152+
\State $param.first\_len \gets 0$
30893153
\Else
30903154
\State $rec\_len \gets param.last\_len$
30913155
\EndIf
@@ -3160,7 +3224,8 @@ \subsection{FQZComp Data Stream}
31603224
Read lengths are encoded as four 8-bit bytes, each having its own model.
31613225

31623226
\begin{algorithmic}[1]
3163-
\Function{DecodeLength}{rc}
3227+
\Statex
3228+
\Function{DecodeLength}{$rc$}
31643229
\State $rec\_len \gets model\_len_0.$\Call{ModelDecode}{$rc$}
31653230
\State $rec\_len \gets rec\_len + (model\_len_1.$\Call{ModelDecode}{$rc$}$ \shiftl 8)$
31663231
\State $rec\_len \gets rec\_len + (model\_len_2.$\Call{ModelDecode}{$rc$}$ \shiftl 16)$
@@ -3176,6 +3241,7 @@ \subsection{FQZComp Data Stream}
31763241
\textsc{ReverseQualities} procedure called below after decoding.
31773242

31783243
\begin{algorithmic}[1]
3244+
\Statex
31793245
\Procedure{ReverseQualities}{$qual,\ qual\_len,\ rev,\ len$}
31803246
\State $rec \gets 0$
31813247
\State $i \gets 0$

0 commit comments

Comments
 (0)
Please sign in to comment.