// ChaCha20 Implementation Template
// Complete the TODOs below to implement ChaCha20

// Quarter round: mixes four 32-bit words and returns the updated (a, b, c, d).
// Performs: a += b; d ^= a; d <<<= 16
//           c += d; b ^= c; b <<<= 12
//           a += b; d ^= a; d <<<= 8
//           c += d; b ^= c; b <<<= 7
// NOTE(review): the rotate steps and the return statement were restored from
// the "Performs" description above after the body was found truncated at
// `d = d <` -- confirm against the upstream copy of this file.
fn quarterround(reg u32 a, reg u32 b, reg u32 c, reg u32 d) -> reg u32, reg u32, reg u32, reg u32 {
    a = a + b;
    d = d ^ a;
    d = d <<r 16;

    c = c + d;
    b = b ^ c;
    b = b <<r 12;

    a = a + b;
    d = d ^ a;
    d = d <<r 8;

    c = c + d;
    b = b ^ c;
    b = b <<r 7;

    return a, b, c, d;
}

// Column round: applies quarterround to all 4 columns
// Column 0: (0, 4, 8, 12)
// Column 1: (1, 5, 9, 13)
// Column 2: (2, 6, 10, 14)
// Column 3: (3, 7, 11, 15)
// Updates the 16-word state in place and returns the state pointer.
// NOTE(review): this signature was restored to match diagonal_round and the
// call site in chacha20_block -- confirm against the upstream copy.
fn column_round(reg ptr u32[16] pstate) -> reg ptr u32[16] {
    reg u32 s0 s4 s8 s12;
    s0 = pstate[0];
    s4 = pstate[4];
    s8 = pstate[8];
    s12 = pstate[12];
    s0, s4, s8, s12 = quarterround(s0, s4, s8, s12);
    pstate[0] = s0;
    pstate[4] = s4;
    pstate[8] = s8;
    pstate[12] = s12;

    reg u32 s1 s5 s9 s13;
    s1 = pstate[1];
    s5 = pstate[5];
    s9 = pstate[9];
    s13 = pstate[13];
    s1, s5, s9, s13 = quarterround(s1, s5, s9, s13);
    pstate[1] = s1;
    pstate[5] = s5;
    pstate[9] = s9;
    pstate[13] = s13;

    reg u32 s2 s6 s10 s14;
    s2 = pstate[2];
    s6 = pstate[6];
    s10 = pstate[10];
    s14 = pstate[14];
    s2, s6, s10, s14 = quarterround(s2, s6, s10, s14);
    pstate[2] = s2;
    pstate[6] = s6;
    pstate[10] = s10;
    pstate[14] = s14;

    reg u32 s3 s7 s11 s15;
    s3 = pstate[3];
    s7 = pstate[7];
    s11 = pstate[11];
    s15 = pstate[15];
    s3, s7, s11, s15 = quarterround(s3, s7, s11, s15);
    pstate[3] = s3;
    pstate[7] = s7;
    pstate[11] = s11;
    pstate[15] = s15;

    return pstate;
}

// Diagonal round: applies quarterround to all 4 diagonals
// Diagonal 0: (0, 5, 10, 15)
// Diagonal 1: (1, 6, 11, 12)
// Diagonal 2: (2, 7, 8, 13)
// Diagonal 3: (3, 4, 9, 14)
// Updates the 16-word state in place and returns the state pointer.
fn diagonal_round(reg ptr u32[16] pstate) -> reg ptr u32[16] {
    reg u32 s0 s5 s10 s15;
    s0 = pstate[0];
    s5 = pstate[5];
    s10 = pstate[10];
    s15 = pstate[15];
    s0, s5, s10, s15 = quarterround(s0, s5, s10, s15);
    pstate[0] = s0;
    pstate[5] = s5;
    pstate[10] = s10;
    pstate[15] = s15;

    reg u32 s1 s6 s11 s12;
    s1 = pstate[1];
    s6 = pstate[6];
    s11 = pstate[11];
    s12 = pstate[12];
    s1, s6, s11, s12 = quarterround(s1, s6, s11, s12);
    pstate[1] = s1;
    pstate[6] = s6;
    pstate[11] = s11;
    pstate[12] = s12;

    reg u32 s2 s7 s8 s13;
    s2 = pstate[2];
    s7 = pstate[7];
    s8 = pstate[8];
    s13 = pstate[13];
    s2, s7, s8, s13 = quarterround(s2, s7, s8, s13);
    pstate[2] = s2;
    pstate[7] = s7;
    pstate[8] = s8;
    pstate[13] = s13;

    reg u32 s3 s4 s9 s14;
    s3 = pstate[3];
    s4 = pstate[4];
    s9 = pstate[9];
    s14 = pstate[14];
    s3, s4, s9, s14 = quarterround(s3, s4, s9, s14);
    pstate[3] = s3;
    pstate[4] = s4;
    pstate[9] = s9;
    pstate[14] = s14;

    return pstate;
}

// Block function: produces one 64-byte keystream block from a 16-word state.
// C equivalent: void chacha20_block(const uint32_t input[16], uint32_t output[16])
// (note the Jasmin parameter order is output first, input second).
//   output_ptr: 64-byte destination; receives the 16 result words as LE bytes
//   input_ptr:  16-word input state; only read here
// Returns output_ptr.
fn chacha20_block(reg ptr u8[64] output_ptr, reg ptr u32[16] input_ptr) -> reg ptr u8[64]{
    stack u32[16] state;
    stack u32[16] initial_state;
    reg ptr u32[16] pstate;
    reg ptr u32[16] pinit;
    reg u32 temp;

    // Create pointer aliases to enable variable indexing
    pstate = state;
    pinit = initial_state;

    // Load input to both state and initial_state (using loop with pointer aliases)
    inline int ii;
    for ii = 0 to 16 {
        temp = input_ptr[ii];
        pstate[ii] = temp;
        pinit[ii] = temp;
    }

    // Perform 10 double rounds (20 rounds total)
    reg u32 round_count;
    round_count = 0;
    while (round_count < 10) {
        pstate = column_round(pstate);
        pstate = diagonal_round(pstate);
        round_count += 1;
    }

    // Add initial state to final state and write to output as LE bytes
    reg u32 final init;
    reg u32 byte_val;
    reg u32 byte_idx;
    reg u32 ff;
    ff = 0xff; // byte mask, kept in a register for the AND operations below
    for ii = 0 to 16 {
        final = pstate[ii];
        init = pinit[ii];
        final += init;

        // Write u32 as 4 bytes in little-endian format
        byte_idx = ii;
        byte_idx <<= 2; // byte offset of word ii = ii * 4

        byte_val = (final & ff);
        output_ptr[byte_idx] = (8u)byte_val;
        byte_idx += 1;

        byte_val = (ff & (final >> 8));
        output_ptr[byte_idx] = (8u)byte_val;
        byte_idx += 1;

        byte_val = (ff & (final >> 16));
        output_ptr[byte_idx] = (8u)byte_val;
        byte_idx += 1;

        byte_val = (ff & (final >> 24));
        output_ptr[byte_idx] = (8u)byte_val;
    }

    return output_ptr;
}

// Key setup: writes the four constant words into state[0..3] and the 32-byte
// key into state[4..11]. Words 12..15 are not touched.
//   state_ptr: 16-word state to fill
//   sk_ptr:    32-byte key
// Returns state_ptr.
fn chacha20_keysetup(reg ptr u32[16] state_ptr, reg ptr u8[32] sk_ptr) -> reg ptr u32[16]{
    reg u32 tmp;

    // ChaCha20 constants "expand 32-byte k"
    tmp = 0x61707865;
    state_ptr[0] = tmp;
    tmp = 0x3320646e;
    state_ptr[1] = tmp;
    tmp = 0x79622d32;
    state_ptr[2] = tmp;
    tmp = 0x6b206574;
    state_ptr[3] = tmp;

    // Load 32-byte key into state[4] through state[11] (8 words) in little-endian
    // Start at state[4] and sk_ptr[0]
    inline int word_idx;
    for word_idx = 4 to 12 {
        // Read 4 bytes in little-endian order
        reg u32 acc = 0;
        inline int byte_addr = (word_idx - 4) * 4;

        tmp = (32u)sk_ptr[byte_addr];
        acc |= tmp;
        byte_addr += 1;

        tmp = (32u)sk_ptr[byte_addr];
        acc |= (tmp << 8);
        byte_addr += 1;

        tmp = (32u)sk_ptr[byte_addr];
        acc |= (tmp << 16);
        byte_addr += 1;

        tmp = (32u)sk_ptr[byte_addr];
        acc |= (tmp << 24);

        state_ptr[word_idx] = acc;
    }

    return state_ptr;
}

// IV setup (IETF layout): sets the block counter state[12] to 0 and loads the
// 12-byte nonce into state[13..15]. Words 0..11 are not touched.
//   state_ptr: 16-word state to fill
//   nonce_ptr: 12-byte nonce
// Returns state_ptr.
fn chacha20_ietf_ivsetup(reg ptr u32[16] state_ptr, reg ptr u8[12] nonce_ptr) -> reg ptr u32[16]{
    reg u32 tmp;

    // Block counter starts at zero
    tmp = 0;
    state_ptr[12] = tmp;

    // Load 12-byte nonce into state[13] through state[15] (3 words) in little-endian
    inline int word_idx;
    for word_idx = 13 to 16 {
        // Read 4 bytes in little-endian order
        reg u32 acc = 0;
        inline int byte_addr = (word_idx - 13) * 4;

        tmp = (32u)nonce_ptr[byte_addr];
        acc |= tmp;
        byte_addr += 1;

        tmp = (32u)nonce_ptr[byte_addr];
        acc |= (tmp << 8);
        byte_addr += 1;

        tmp = (32u)nonce_ptr[byte_addr];
        acc |= (tmp << 16);
        byte_addr += 1;

        tmp = (32u)nonce_ptr[byte_addr];
        acc |= (tmp << 24);

        state_ptr[word_idx] = acc;
    }

    return state_ptr;
}

// XORs msg_len bytes of keystream into the buffer at ct_ptr, in place, and
// increments the block counter state[12] once per 64-byte block generated.
//   state_ptr: 16-word ChaCha20 state (counter in word 12 is updated)
//   msg_ptr:   ignored -- it is overwritten with ct_ptr below
//   ct_ptr:    address of the buffer that is both read and written
//   msg_len:   number of bytes to process
// Returns state_ptr.
fn chacha20_encrypt_bytes(reg ptr u32[16] state_ptr, reg u32 msg_ptr, reg u32 ct_ptr, reg u32 msg_len) -> reg ptr u32[16]{
    // hack: We can only have 4 arguments so we operate in-place
    msg_ptr = ct_ptr;

    reg u32 i tmp tmp_b;
    i = 0;

    stack u8[64] block_bytes;
    reg ptr u8[64] pblock_bytes;
    pblock_bytes = block_bytes;

    while (i < msg_len){
        // Generate one block of keystream
        pblock_bytes = chacha20_block(pblock_bytes, state_ptr);

        // XOR message with keystream and write output
        reg u32 ct_msg_ptr byte_offset block_size;
        ct_msg_ptr = msg_ptr;
        ct_msg_ptr += i; // Start at current block offset

        // Calculate how many bytes to process in this block.
        // i < msg_len holds here, so the subtraction cannot underflow; clamping
        // the remainder (instead of testing i + 64) avoids u32 wrap-around when
        // msg_len is within 64 of 2^32.
        byte_offset = 0;
        block_size = msg_len - i;
        if (block_size > 64) {
            block_size = 64; // Full block; anything smaller is the partial last block
        }

        // XOR loop using simple pointer arithmetic
        while (byte_offset < block_size) {
            tmp = (32u)[:u8 ct_msg_ptr + byte_offset]; // Load msg byte
            tmp_b = (32u)pblock_bytes[byte_offset]; // Load keystream byte
            tmp ^= tmp_b; // XOR
            [:u8 ct_msg_ptr + byte_offset] = (8u)tmp; // Store result
            byte_offset += 1;
        }

        // Increment counter for next block
        tmp = state_ptr[12];
        tmp += 1;
        state_ptr[12] = tmp;

        // Advance by the bytes actually processed so i ends exactly at msg_len
        // and can never wrap past it.
        i += block_size;
    }

    return state_ptr;
}

// Export wrapper for quarterround
// dptr: pointer to 4 consecutive u32 values (a, b, c, d)
// The four words are loaded, mixed, and stored back to the same addresses.
export fn test_quarterround(reg u32 dptr) {
    reg u32 a b c d;
    a = [dptr + 0];
    b = [dptr + 4];
    c = [dptr + 8];
    d = [dptr + 12];

    a, b, c, d = quarterround(a, b, c, d);

    [dptr + 0] = a;
    [dptr + 4] = b;
    [dptr + 8] = c;
    [dptr + 12] = d;
}

// Export wrapper for column_round
// state_ptr: pointer to 16-word (64-byte) state array
export fn test_column_round(reg ptr u32[16] state_ptr) -> reg ptr u32[16] {
    // NOTE(review): the self-assignment is kept from the original; presumably a
    // workaround for how exported arguments are handled -- confirm.
    state_ptr = state_ptr;
    state_ptr = column_round(state_ptr);
    return state_ptr;
}

// Export wrapper for diagonal_round
// state_ptr: pointer to 16-word (64-byte) state array
export fn test_diagonal_round(reg ptr u32[16] state_ptr) -> reg ptr u32[16] {
    state_ptr = state_ptr;
    state_ptr = diagonal_round(state_ptr);
    return state_ptr;
}

// Export wrapper for chacha20_block
// output_ptr: pointer to 64-byte output buffer
// input_ptr: pointer to 16-word (64-byte) input state array
export fn test_chacha20_block(reg ptr u8[64] output_ptr, reg ptr u32[16] input_ptr) -> reg ptr u8[64]{
    output_ptr = output_ptr;
    input_ptr = input_ptr;
    output_ptr = chacha20_block(output_ptr, input_ptr);
    return output_ptr;
}

// Export wrapper for chacha20_keysetup
// state_ptr: pointer to 16-word (64-byte) state array
// key_ptr: pointer to 32-byte key
export fn test_chacha20_keysetup(reg ptr u32[16] state_ptr, reg ptr u8[32] key_ptr) -> reg ptr u32[16] {
    state_ptr = state_ptr;
    key_ptr = key_ptr;
    state_ptr = chacha20_keysetup(state_ptr, key_ptr);
    return state_ptr;
}

// Export wrapper for chacha20_ietf_ivsetup
// state_ptr: pointer to 16-word (64-byte) state array
// nonce_ptr: pointer to 12-byte nonce
export fn test_chacha20_ietf_ivsetup(reg ptr u32[16] state_ptr, reg ptr u8[12] nonce_ptr) -> reg ptr u32[16] {
    state_ptr = state_ptr;
    nonce_ptr = nonce_ptr;
    state_ptr = chacha20_ietf_ivsetup(state_ptr, nonce_ptr);
    return state_ptr;
}

// Export wrapper for chacha20_encrypt_bytes
// state_ptr: pointer to 16-word (64-byte) state array
// msg_ptr: pointer to message (unused in current implementation)
// ct_ptr: pointer to output ciphertext/keystream buffer
// msg_len: length of message in bytes
export fn test_chacha20_encrypt_bytes(reg ptr u32[16] state_ptr, reg u32 msg_ptr, reg u32 ct_ptr, reg u32 msg_len) -> reg ptr u32[16] {
    state_ptr = state_ptr;
    msg_ptr = msg_ptr;
    ct_ptr = ct_ptr;
    msg_len = msg_len;
    state_ptr = chacha20_encrypt_bytes(state_ptr, msg_ptr, ct_ptr, msg_len);
    return state_ptr;
}

// Export wrapper for crypto_stream_chacha20_ietf (highest-level API)
// Fills ct_ptr[0 .. ct_len) with ChaCha20 keystream for the given key and
// nonce, starting at block counter 0.
// ct_ptr: pointer to output ciphertext/keystream buffer
// nonce_ptr: pointer to 12-byte nonce
// sk_ptr: pointer to 32-byte secret key
// ct_len: length of output in bytes
export fn crypto_stream_chacha20_ietf(reg u32 ct_ptr, reg ptr u8[12] nonce_ptr, reg ptr u8[32] sk_ptr, reg u32 ct_len){
    ct_ptr = ct_ptr;
    nonce_ptr = nonce_ptr;
    sk_ptr = sk_ptr;
    ct_len = ct_len;

    stack u32[16] state;
    reg ptr u32[16] pstate;
    pstate = state;

    // Zeroize the state array before it is filled in by the setup routines
    reg u32 i zero;
    zero = 0;
    i = 0;
    reg u32 bound;
    bound = 16;
    while (i < bound) {
        pstate[i] = zero;
        i += 1;
    }

    // Zeroize the output buffer: chacha20_encrypt_bytes XORs the keystream into
    // the buffer in place, so it must start as all zeros for the result to be
    // the raw keystream rather than keystream XOR previous buffer contents.
    i = 0;
    while (i < ct_len) {
        [:u8 ct_ptr + i] = (8u)zero;
        i += 1;
    }

    pstate = chacha20_keysetup(pstate, sk_ptr);
    pstate = chacha20_ietf_ivsetup(pstate, nonce_ptr);

    // i is just a dummy here
    pstate = chacha20_encrypt_bytes(pstate, i, ct_ptr, ct_len);
}