; Surprise!  I don't know what assembler you are using (masm, tasm,
; nasm, jasm, a86, etc.) so you get to wrap either of these routines for your
; specific situation.  Be sure to read the TRASHES comment below.
;
; The speed-optimized routine is lz4_decompress and the size-optimized
; routine is lz4_decompress_small.

COMMENT #
function lz4_decompress(inb,outb:pointer):word

Decompresses an LZ4 stream file with a compressed chunk 64K or less in size.
Input:
  DS:SI Location of source data.  DWORD magic header and DWORD chunk size
        must be intact; it is best to load the entire LZ4 file into this
        location before calling this code.

Output:
  ES:DI Decompressed data.  If using an entire 64K segment, decompression
        is "safe" because overruns will wrap around the segment.
  AX    Size of decompressed data (0 if the chunk size's high word is
        non-zero, in which case nothing is decompressed).

Trashes AX, BX, CX, DX, SI, DI, ES, and the flags (direction flag is cleared).
        DS and BP are preserved; save anything else you need before calling
        this code.
#

asm
;---------------------------------------------------------------------------
; lz4_decompress -- speed-optimized body.
;
; In:    inb  = far pointer to source (4-byte magic, 4-byte chunk size, data)
;        outb = far pointer to destination buffer
; Out:   AX   = number of bytes written (0 if chunk size high word <> 0)
; Saved: DS, BP (pushed on entry, popped at @done)
;
; Register roles inside the loop:
;   DS:SI  read pointer into the compressed chunk
;   ES:DI  write pointer into the output
;   BP     offset of the first byte past the compressed chunk
;   BX     offset of @SHR4table (XLAT base, read through CS)
;   DX     packed token; becomes the match offset once it has been read
;   CX     literal count / match count (match count excludes the minimum 4)
;   AX     scratch; AH is kept at 0 whenever a count byte is LODSB'd so that
;          "add cx,ax" adds exactly that byte
;
; All hex constants are written with a leading digit and an "h" suffix so
; they are not mistaken for symbols or malformed decimals by the assembler.
;---------------------------------------------------------------------------
        jmp     @decompinit     ;hop over the lookup table embedded in the code

; 256-entry table: entry N holds N shr 4 (the upper nibble of N).
@SHR4table:
        DB 00h,00h,00h,00h,00h,00h,00h,00h,00h,00h,00h,00h,00h,00h,00h,00h
        DB 01h,01h,01h,01h,01h,01h,01h,01h,01h,01h,01h,01h,01h,01h,01h,01h
        DB 02h,02h,02h,02h,02h,02h,02h,02h,02h,02h,02h,02h,02h,02h,02h,02h
        DB 03h,03h,03h,03h,03h,03h,03h,03h,03h,03h,03h,03h,03h,03h,03h,03h
        DB 04h,04h,04h,04h,04h,04h,04h,04h,04h,04h,04h,04h,04h,04h,04h,04h
        DB 05h,05h,05h,05h,05h,05h,05h,05h,05h,05h,05h,05h,05h,05h,05h,05h
        DB 06h,06h,06h,06h,06h,06h,06h,06h,06h,06h,06h,06h,06h,06h,06h,06h
        DB 07h,07h,07h,07h,07h,07h,07h,07h,07h,07h,07h,07h,07h,07h,07h,07h
        DB 08h,08h,08h,08h,08h,08h,08h,08h,08h,08h,08h,08h,08h,08h,08h,08h
        DB 09h,09h,09h,09h,09h,09h,09h,09h,09h,09h,09h,09h,09h,09h,09h,09h
        DB 0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah,0Ah
        DB 0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh,0Bh
        DB 0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch,0Ch
        DB 0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh,0Dh
        DB 0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh,0Eh
        DB 0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh,0Fh

@decompinit:
        push    ds              ;preserve compiler assumptions
        push    bp              ;preserve compiler assumptions
        les     di,outb         ;ES:DI = target buffer (ES is not restored)
        push    di              ;save starting offset so @done can compute size
        lds     si,inb          ;DS:SI = source buffer
        add     si,4            ;skip magic number
        cld                     ;make strings copy forward
        mov     bx,OFFSET @SHR4table ;prepare BX for XLAT later on
        lodsw                   ;load chunk size low 16-bit word
        mov     bp,ax           ;BP = size of compressed chunk
        lodsw                   ;load chunk size high 16-bit word
        add     bp,si           ;BP = SI past the header + size = end of chunk
        or      ax,ax           ;is high word non-zero?
        jnz     @done           ;if so, chunk too big or malformed: return 0
                                ;falling through, AX = 0 (so AH = 0 below)

; The first token is handled by its own copy of the literal path; the main
; loop's equivalent is at @parsetoken.
@starttoken:
        lodsb                   ;grab token to AL
        mov     dx,ax           ;preserve packed token in DX
  segcs xlat                    ;AL = CS:[BX+AL] = token shr 4
        mov     cx,ax           ;CX = unpacked literal length token
        jcxz    @copymatches    ;if CX = 0, no literals; try matches
        cmp     al,0Fh          ;is it 15?
        jne     @doliteralcopy1 ;if not, the count is complete: start copying
; Here the extended count is the fall-through case (the original author
; notes the very first literal length is likely to be 15 or more); in the
; main loop it is the taken-branch case instead.
@build1stcount:
        lodsb                   ;next length byte (AH is still 0)
        add     cx,ax           ;accumulate into the literal count
        cmp     al,0FFh         ;a byte of 255 means another byte follows
        je      @build1stcount
@doliteralcopy1:
        rep     movsb           ;src and dst might overlap so do this by bytes

;At this point, we might be done; all LZ4 data ends with five literals and the
;offset token is ignored.  If we're at the end of our compressed chunk, stop.

        cmp     si,bp           ;are we at the end of our compressed chunk?
        jae     @done           ;if so, jump to exit; otherwise, process match

@copymatches:
        lodsw                   ;AX = match offset
        xchg    dx,ax           ;AX = packed token, DX = match offset
        and     al,0Fh          ;unpack match length token
        cmp     al,0Fh          ;is it 15?
        xchg    cx,ax           ;CX = match length token, AX = old CX (0);
                                ;XCHG leaves the flags from CMP intact
        je      @buildmcount    ;token was 15: extend the count first

@domatchcopy:
        cmp     dx,2            ;offsets up to 2 just repeat a byte or a word
        jbe     @domatchfill    ;if so, perform RLE expansion optimally
                                ;(an offset of 0 also lands on the byte fill)
        push    ds
        xchg    si,ax           ;AX = source offset; with the PUSH, DS:SI saved
        mov     si,di
        sub     si,dx           ;SI = DI - match offset
        mov     dx,es
        mov     ds,dx           ;ds:si points at match; es:di points at dest
        movsw
        movsw                   ;minimum match is 4 bytes; move them ourselves
        shr     cx,1            ;CX = remaining words, CF = odd trailing byte
        rep     movsw           ;cx contains count-4 so copy the rest
        adc     cx,cx           ;CX is 0 after REP, so CX = CF (0 or 1)
        rep     movsb           ;copy the odd byte, if any
        xchg    si,ax           ;SI = saved source offset
        pop     ds              ;ds:si restored

@parsetoken:                    ;CX always 0 here because of REP
        xchg    cx,ax           ;AX = 0, so AH is zero for the loads below
        lodsb                   ;grab token to AL
        mov     dx,ax           ;preserve packed token in DX
@copyliterals:                  ;next 5 lines are 8088-optimal, do not rearrange
  segcs xlat                    ;AL = CS:[BX+AL] = token shr 4
        mov     cx,ax           ;CX = unpacked literal length token
        jcxz    @copymatches    ;if CX = 0, no literals; try matches
        cmp     al,0Fh          ;is it 15?
        je      @buildlcount    ;if so, build full length, else start copying
@doliteralcopy:                 ;src and dst might overlap so do this by bytes
        rep     movsb           ;if cx=0 nothing happens

;At this point, we might be done; all LZ4 data ends with five literals and the
;offset token is ignored.  If we're at the end of our compressed chunk, stop.

@testformore:
        cmp     si,bp           ;are we at the end of our compressed chunk?
        jb      @copymatches    ;if not, keep going
        jmp     @done           ;if so, end

; RLE fills.  Entered with the flags from "cmp dx,2" and CX = match count
; minus the minimum 4.
@domatchfill:
        je      @domatchfill2   ;if DX=2, RLE by word, else by byte
@domatchfill1:
        mov     al,es:[di-1]    ;load byte we are filling with
        mov     ah,al           ;copy to ah so we can do 16-bit fills
        stosw                   ;minimum match is 4 bytes, so we fill four
        stosw
        inc     cx              ;round up for the shift
        shr     cx,1            ;CX = (count+1)/2 words; CF = 1 if count even
        rep     stosw           ;includes odd byte - ok because LZ4 never ends with matches
        adc     di,-1           ;DI = DI-1+CF: back up one byte if count was odd
        jmp     @parsetoken     ;continue decompressing

@domatchfill2:
        mov     ax,es:[di-2]    ;load word we are filling with
        stosw                   ;minimum match is 4 bytes, so we fill four
        stosw
        inc     cx              ;round up for the shift
        shr     cx,1            ;CX = (count+1)/2 words; CF = 1 if count even
        rep     stosw           ;includes odd byte - ok because LZ4 never ends with matches
        adc     di,-1           ;DI = DI-1+CF: back up one byte if count was odd
        jmp     @parsetoken     ;continue decompressing

@buildlcount:                   ;build full literal length count
        lodsb                   ;get next literal count byte
        add     cx,ax           ;increase count
        cmp     al,0FFh         ;more count bytes to read?
        je      @buildlcount
        jmp     @doliteralcopy

@buildmcount:                   ;build full match length count - AX is 0
        lodsb                   ;get next match count byte
        add     cx,ax           ;increase count
        cmp     al,0FFh         ;more count bytes to read?
        je      @buildmcount
        jmp     @domatchcopy

@done:
        pop     ax              ;retrieve previous starting offset
        sub     di,ax           ;subtract prev offset from where we are now
        xchg    ax,di           ;AX = decompressed size
        pop     bp              ;restore compiler assumptions
        pop     ds              ;restore compiler assumptions
end;

COMMENT #

function lz4_decompress_small(inb,outb:pointer):word

Same as LZ4_Decompress but optimized for size, not speed.  Still pretty fast,
although roughly 30% slower than lz4_decompress and RLE sequences are not
optimally handled.  Same Input, Output, and Trashes as lz4_decompress.
Assembles to 79 bytes. Thanks to Peter Ferrie for suggestions!

#

asm
;---------------------------------------------------------------------------
; lz4_decompress_small -- size-optimized body.
;
; In:    inb  = far pointer to source (4-byte magic, 4-byte chunk size, data)
;        outb = far pointer to destination buffer
; Out:   AX   = number of bytes written (0 if chunk size high word <> 0)
; Saved: DS (pushed on entry, popped at @done); BP is never modified
;
; Register roles inside the loop:
;   DS:SI  read pointer into the compressed chunk
;   ES:DI  write pointer into the output
;   BX     offset of the first byte past the compressed chunk
;   DX     packed token; becomes the match offset once it has been read
;   CX     literal count / match count
;   AX     scratch; AH is 0 whenever @buildfullcount is called
;
; Hex constants are written with a leading digit and an "h" suffix so the
; assembler does not mistake them for symbols.
;---------------------------------------------------------------------------
        push    ds              ;preserve compiler assumptions
        les     di,outb         ;ES:DI = target buffer (ES is not restored)
        push    di              ;save starting offset so @done can compute size
        lds     si,inb          ;DS:SI = source buffer
        cld                     ;make strings copy forward
        lodsw
        lodsw                   ;skip magic number, smaller than "add si,4"
        lodsw                   ;load chunk size low 16-bit word
        xchg    bx,ax           ;BX = size of compressed chunk
        add     bx,si           ;BX = SI + size
                                ;NOTE(review): SI has not yet passed the high
                                ;word here, so BX is 2 short of the position
                                ;the speed-optimized routine computes -- confirm
        lodsw                   ;load chunk size high 16-bit word
        xchg    cx,ax           ;CX = high word (AX now holds the old CX)
        inc     cx              ;CX = high word + 1
        loop    @done           ;LOOP decrements CX and jumps if it is non-zero,
                                ;i.e. if the high word was non-zero: abort.
                                ;Otherwise fall through with CX = 0.

@parsetoken:                    ;CX=0 here (from LOOP above or REP below)
        xchg    cx,ax           ;AX = 0, so AH is zero for the loads below
        lodsb                   ;grab token to AL
        mov     dx,ax           ;preserve packed token in DX
@copyliterals:
        mov     cl,4
        shr     al,cl           ;unpack upper 4 bits
        call    @buildfullcount ;CX = full literal count
@doliteralcopy:                 ;src and dst might overlap so do this by bytes
        rep     movsb           ;if cx=0 nothing happens

;At this point, we might be done; all LZ4 data ends with five literals and the
;offset token is ignored.  If we're at the end of our compressed chunk, stop.

        cmp     si,bx           ;are we at the end of our compressed chunk?
        jae     @done           ;if so, jump to exit; otherwise, process match

@copymatches:
        lodsw                   ;AX = match offset
        xchg    dx,ax           ;AX = packed token, DX = match offset
        and     al,0Fh          ;unpack match length token
        call    @buildfullcount ;CX = full match count, less the minimum 4
@domatchcopy:
        push    ds
        xchg    si,ax           ;AX = source offset; with the PUSH, DS:SI saved
        mov     si,di
        sub     si,dx           ;SI = DI - match offset
        push    es
        pop     ds              ;ds:si points at match; es:di points at dest
        add     cx,4            ;minimum match is 4
        rep     movsb           ;copy match run; movsb handles si=di-1 condition
        xchg    si,ax           ;SI = saved source offset
        pop     ds              ;ds:si restored
        jmp     @parsetoken

; In:  AL = 4-bit length token, AH = 0, DS:SI = next compressed byte
; Out: CX = full count.  If the token is 15, following bytes are added to it
;      until one is not 255; SI advances past every byte read.
@buildfullcount:
        mov     cx,ax           ;CX = unpacked length token
        cmp     al,0Fh          ;is it 15?
        jne     @builddone      ;if not, we have nothing to build
@buildloop:
        lodsb                   ;load a byte
        add     cx,ax           ;add it to the full count
        cmp     al,0FFh         ;was it FFh?
        je      @buildloop      ;if so, keep going
@builddone:
        ret

@done:
        pop     ax              ;retrieve previous starting offset
        sub     di,ax           ;subtract prev offset from where we are now
        xchg    ax,di           ;AX = decompressed size
        pop     ds              ;restore compiler assumptions