; vim:filetype=nasm ts=8 ; libFLAC - Free Lossless Audio Codec library ; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; ; - Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; - Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the distribution. ; ; - Neither the name of the Xiph.org Foundation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "nasm.h"

	data_section

cextern FLAC__crc16_table		; unsigned FLAC__crc16_table[256];
cextern bitreader_read_from_client_	; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);

cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap

	code_section

; **********************************************************************
;
; FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
;
; Decodes 'nvals' Rice-coded signed integers from the bit reader into
; vals[].  Each value is a unary-coded quotient (a run of 0 bits ended
; by a 1 stop bit) followed by 'parameter' binary bits; the combined
; unsigned value 'uval' is mapped to signed as (uval >> 1) ^ -(uval & 1).
;
; Syntax:  NASM, 32-bit x86.
; In:      arguments are read from the stack (see the keys below):
;            [esp +  4] FLAC__BitReader *br
;            [esp +  8] int vals[]
;            [esp + 12] unsigned nvals
;            [esp + 16] unsigned parameter
; Out:     eax = 1 on success (including nvals == 0),
;          eax = 0 if bitreader_read_from_client_() returned 0.
; Saved:   ebp, ebx, esi, edi are pushed on entry and popped on exit.
; Scratch: eax, ecx, edx, flags.
; Stack:   four saved registers plus one dword local ('ucbits', the
;          count of unconsumed bits currently in the buffer).
; Effects: every fully consumed buffer word is folded into br->read_crc;
;          br->consumed_words / br->consumed_bits are written back before
;          each refill call and on successful completion.
;
; Some details like assertions and other checking is performed by the caller.
	ALIGN 16
cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap

	;ASSERT(0 != br);
	;ASSERT(0 != br->buffer);
	; WATCHOUT: code only works if sizeof(brword)==32; we can make things much faster with this assertion
	;ASSERT(FLAC__BITS_PER_WORD == 32);
	;ASSERT(parameter < 32);
	; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it

	;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
	;; [esp + 16]	unsigned parameter
	;; [esp + 12]	unsigned nvals
	;; [esp + 8]	int vals[]
	;; [esp + 4]	FLAC__BitReader *br
	mov	eax, [esp + 12]		; if(nvals == 0)
	test	eax, eax
	jnz	.nvals_gt_0		;   (test clears CF, so this is the same branch the unsigned 'ja' form would take)
	mov	eax, 1			;   return true;
	ret

.nvals_gt_0:
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, 4			; room for the 'ucbits' local
	;; [esp + 36]	unsigned parameter
	;; [esp + 32]	unsigned nvals
	;; [esp + 28]	int vals[]
	;; [esp + 24]	FLAC__BitReader *br
	;; [esp]	ucbits
	mov	ebp, [esp + 24]		; ebp <- br (br->buffer is the field at offset 0)
	mov	esi, [ebp + 16]		; esi <- br->consumed_words (aka 'cwords' in the C version)
	mov	ecx, [ebp + 20]		; ecx <- br->consumed_bits  (aka 'cbits'  in the C version)
	xor	edi, edi		; edi <- 0  'uval'
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [ebp]	br->buffer
	;; [ebp + 8]	br->words
	;; [ebp + 12]	br->bytes
	;; [ebp + 16]	br->consumed_words
	;; [ebp + 20]	br->consumed_bits
	;; [ebp + 24]	br->read_crc
	;; [ebp + 28]	br->crc16_align

					; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	eax, [ebp + 8]		;   eax <- br->words
	sub	eax, esi		;   eax <- br->words-cwords
	shl	eax, 2			;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	eax, [ebp + 12]		;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	eax, 3			;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	eax, ecx		;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	mov	[esp], eax		;   ucbits <- eax

	ALIGN 16
.val_loop:				; while(1) {

	;
	; read unary part
	;
.unary_loop:				;   while(1) {
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	cmp	esi, [ebp + 8]		;     while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
	jae	near .c1_next1
.c1_loop:				;     {
	mov	ebx, [ebp]
	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
	mov	edx, eax		;       edx = br->buffer[cwords] (saved for later use)
	shl	eax, cl			;       b = br->buffer[cwords] << cbits
	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
	jz	near .c1_next2		;       if(b) {
	bsr	ebx, eax
	not	ebx
	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
	add	ecx, ebx		;         cbits += i;
	add	edi, ebx		;         uval += i;
	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
	test	ecx, ~31
	jz	near .break1		;         if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
					;           crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c0b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c0b0:
	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c0b1:
	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16
.c0b2:
	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c0b3:
	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		drop the bits shifted above bit 15
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	edi

	add	esi, byte 1		;           cwords++;
	xor	ecx, ecx		;           cbits = 0;
					;         }
	jmp	near .break1		;         goto break1;

	;; this section relocated out of the way for performance
.c0b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c0b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c0b2
	jmp	.c0b3

	;; this section relocated out of the way for performance
.c1b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c1b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c1b2
	jmp	.c1b3

.c1_next2:				;       } else {
	;; ecx		cbits
	;; edx		current brword 'b'
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	add	edi, 32
	sub	edi, ecx		;         uval += FLAC__BITS_PER_WORD - cbits;
					;         crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c1b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c1b0:
	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c1b1:
	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16
.c1b2:
	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c1b3:
	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		drop the bits shifted above bit 15
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	edi

	add	esi, byte 1		;         cwords++;
	xor	ecx, ecx		;         cbits = 0;
					;         /* didn't find stop bit yet, have to keep going... */
					;       }

	cmp	esi, [ebp + 8]		;     } while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
	jb	near .c1_loop

.c1_next1:
	; at this point we've eaten up all the whole words; have to try
	; reading through any tail bytes before calling the read callback.
	; this is a repeat of the above logic adjusted for the fact we
	; don't have a whole word.  note though if the client is feeding
	; us data a byte at a time (unlikely), br->consumed_bits may not
	; be zero.
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	mov	edx, [ebp + 12]		; edx <- br->bytes
	test	edx, edx
	jz	.read1			;     if(br->bytes) {  [NOTE: this case is rare so it doesn't have to be all that fast ]
	mov	ebx, [ebp]
	shl	edx, 3			;       edx <- const unsigned end = br->bytes * 8;
	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
	xchg	edx, ecx		;       [edx <- cbits , ecx <- end]
	mov	ebx, 0xffffffff		;       ebx <- FLAC__WORD_ALL_ONES
	shr	ebx, cl			;       ebx <- FLAC__WORD_ALL_ONES >> end
	not	ebx			;       ebx <- ~(FLAC__WORD_ALL_ONES >> end)
	xchg	edx, ecx		;       [edx <- end , ecx <- cbits]
	and	eax, ebx		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end));
	shl	eax, cl			;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
	jz	.c1_next3		;       if(b) {
	bsr	ebx, eax
	not	ebx
	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
	add	ecx, ebx		;         cbits += i;
	add	edi, ebx		;         uval += i;
	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
	jmp	short .break1		;         goto break1;
.c1_next3:				;       } else {
	sub	edi, ecx
	add	edi, edx		;         uval += end - cbits;
	; The zero bits just scanned are [cbits, end), so the new position
	; is exactly 'end'.  Adding 'end' to a non-zero cbits would
	; overshoot, and the result is stored to br->consumed_bits, used as
	; a shift count and subtracted when ucbits is recomputed below.
	mov	ecx, edx		;         cbits = end;
					;         /* didn't find stop bit yet, have to keep going... */
					;       }
					;     }
.read1:
	; flush registers and read; bitreader_read_from_client_() does
	; not touch br->consumed_bits at all but we still need to set
	; it in case it fails and we have to return false.
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	mov	[ebp + 16], esi		;     br->consumed_words = cwords;
	mov	[ebp + 20], ecx		;     br->consumed_bits = cbits;
	push	ecx			;     /* save */
	push	ebp			;     /* push br argument */
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	call	_bitreader_read_from_client_
%else
	call	bitreader_read_from_client_
%endif
	pop	edx			;     /* discard, unused */
	pop	ecx			;     /* restore */
	mov	esi, [ebp + 16]		;     cwords = br->consumed_words;
					;     ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	ebx, [ebp + 8]		;       ebx <- br->words
	sub	ebx, esi		;       ebx <- br->words-cwords
	shl	ebx, 2			;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	ebx, [ebp + 12]		;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	ebx, 3			;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	ebx, ecx		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	add	ebx, edi		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval
					;           + uval to offset our count by the # of unary bits already
					;           consumed before the read, because we will add these back
					;           in all at once at break1
	mov	[esp], ebx		;       ucbits <- ebx
	test	eax, eax		;     if(!bitreader_read_from_client_(br))
	jnz	near .unary_loop
	jmp	.end			;       return false; /* eax (the return value) is already 0 */
					;   } /* end while(1) unary part */

	ALIGN 16
.break1:
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [esp]	ucbits
	sub	[esp], edi		;   ucbits -= uval;
	sub	dword [esp], byte 1	;   ucbits--; /* account for stop bit */

	;
	; read binary part
	;
	mov	ebx, [esp + 36]		;   ebx <- parameter
	test	ebx, ebx		;   if(parameter) {
	jz	near .break2
.read2:
	cmp	[esp], ebx		;     while(ucbits < parameter) {
	jae	.c2_next1
	; flush registers and read; bitreader_read_from_client_() does
	; not touch br->consumed_bits at all but we still need to set
	; it in case it fails and we have to return false.
	mov	[ebp + 16], esi		;       br->consumed_words = cwords;
	mov	[ebp + 20], ecx		;       br->consumed_bits = cbits;
	push	ecx			;       /* save */
	push	ebp			;       /* push br argument */
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	call	_bitreader_read_from_client_
%else
	call	bitreader_read_from_client_
%endif
	pop	edx			;       /* discard, unused */
	pop	ecx			;       /* restore */
	mov	esi, [ebp + 16]		;       cwords = br->consumed_words;
					;       ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	edx, [ebp + 8]		;         edx <- br->words
	sub	edx, esi		;         edx <- br->words-cwords
	shl	edx, 2			;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	edx, [ebp + 12]		;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	edx, 3			;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	edx, ecx		;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	mov	[esp], edx		;         ucbits <- edx
	test	eax, eax		;       if(!bitreader_read_from_client_(br))
	jnz	.read2
	jmp	.end			;         return false; /* eax (the return value) is already 0 */
					;     }
.c2_next1:
	;; ebx		parameter
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [esp]	ucbits
	cmp	esi, [ebp + 8]		;     if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
	jae	near .c2_next2
	test	ecx, ecx		;       if(cbits) {
	jz	near .c2_next3		;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
	mov	eax, 32
	mov	edx, [ebp]
	sub	eax, ecx		;         const unsigned n = FLAC__BITS_PER_WORD - cbits;
	mov	edx, [edx + 4*esi]	;         const brword word = br->buffer[cwords];
	cmp	ebx, eax		;         if(parameter < n) {
	jae	.c2_next4
					;           uval <<= parameter;
					;           uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
	shl	edx, cl
	xchg	ebx, ecx		;           [ebx <- cbits , ecx <- parameter]
	shld	edi, edx, cl
	add	ebx, ecx		;           cbits += parameter;
	xchg	ebx, ecx		;           ebx <- parameter, ecx <- cbits
	jmp	.break2			;           goto break2;
					;         }
.c2_next4:
					;         uval <<= n;
					;         uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
%if 1
	rol	edx, cl			;		@@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing
					;		@@@@@@OPT: or put parameter in ch instead and free up ebx completely again
%else
	shl	edx, cl
%endif
	xchg	eax, ecx		;		[eax <- cbits , ecx <- n]
	shld	edi, edx, cl
	xchg	eax, ecx		;		[eax <- n , ecx <- cbits]
%if 1
	ror	edx, cl			;		restored.
%else
	mov	edx, [ebp]
	mov	edx, [edx + 4*esi]
%endif
					;         crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	push	ebx			;		[need more registers]
	push	eax			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c2b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c2b0:
	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c2b1:
	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16
.c2b2:
	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c2b3:
	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		drop the bits shifted above bit 15
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	eax
	pop	ebx
	pop	edi
	add	esi, byte 1		;         cwords++;
	mov	ecx, ebx
	sub	ecx, eax		;         cbits = parameter - n;
	jz	.break2			;         if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
					;           uval <<= cbits;
					;           uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]
	shld	edi, eax, cl
					;         }
	jmp	.break2			;         goto break2;

	;; this section relocated out of the way for performance
.c2b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c2b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c2b2
	jmp	.c2b3

.c2_next3:				;       } else {
	mov	ecx, ebx		;         cbits = parameter;
					;         uval <<= cbits;
					;         uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]
	shld	edi, eax, cl
	jmp	.break2			;         goto break2;
					;       }
.c2_next2:				;     } else {
	; in this case we're starting our read at a partial tail word;
	; the reader has guaranteed that we have at least 'parameter'
	; bits available to read, which makes this case simpler.
	; uval <<= parameter;
	; if(cbits) {
	;   /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
	;   uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
	;   cbits += parameter;
	;   goto break2;
	; } else {
	;   cbits = parameter;
	;   uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
	;   goto break2;
	; }
	; the above is much shorter in assembly:
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]	;       eax <- br->buffer[cwords]
	shl	eax, cl			;       eax <- br->buffer[cwords] << cbits
	add	ecx, ebx		;       cbits += parameter
	xchg	ebx, ecx		;       ebx <- cbits, ecx <- parameter
	shld	edi, eax, cl		;       uval <<= parameter <<< 'parameter' bits of tail word
	xchg	ebx, ecx		;       ebx <- parameter, ecx <- cbits
					;     }
					;   }
.break2:
	sub	[esp], ebx		;   ucbits -= parameter;

	;
	; compose the value
	;
	mov	ebx, [esp + 28]		;   ebx <- vals
	mov	edx, edi		;   edx <- uval
	and	edi, 1			;   edi <- uval & 1
	shr	edx, 1			;   edx <- uval >> 1
	neg	edi			;   edi <- -(int)(uval & 1)
	xor	edx, edi		;   edx <- (uval >> 1 ^ -(int)(uval & 1))
	mov	[ebx], edx		;   *vals <- edx
	sub	dword [esp + 32], byte 1	;   --nvals;
	jz	.finished		;   if(nvals == 0) /* jump to finish */
	xor	edi, edi		;   uval = 0;
	add	dword [esp + 28], 4	;   ++vals
	jmp	.val_loop		; }

.finished:
	mov	[ebp + 16], esi		; br->consumed_words = cwords;
	mov	[ebp + 20], ecx		; br->consumed_bits = cbits;
	mov	eax, 1			; return true;
.end:
	add	esp, 4			; release the 'ucbits' local
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

end

%ifdef OBJ_FORMAT_elf
	section .note.GNU-stack noalloc
%endif