diff --git a/aes/ctr.go b/aes/ctr.go
new file mode 100644
index 0000000..89e2305
--- /dev/null
+++ b/aes/ctr.go
@@ -0,0 +1,148 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package aes
+
+import (
+	//"crypto/internal/fips140"
+	"github.com/xtls/reality/alias"
+	"github.com/xtls/reality/subtle"
+	"github.com/xtls/reality/byteorder"
+	"math/bits"
+)
+
+type CTR struct {
+	b          Block  // cipher state, held by value (copied from *b in newCTR)
+	ivlo, ivhi uint64 // start counter as 64-bit limbs: ivhi = iv[0:8], ivlo = iv[8:16]
+	offset     uint64 // keystream byte offset; used by XORKeyStream and RoundToBlock only
+}
+
+// NewCTR returns a CTR that uses a copy of *b and starts counting at iv.
+// It panics if len(iv) != BlockSize.
+func NewCTR(b *Block, iv []byte) *CTR {
+	// Inlineable wrapper, so c can live in the caller's stack frame (issue 70499).
+	c := newCTR(b, iv)
+	return &c
+}
+// newCTR builds the CTR value for NewCTR; it panics unless iv is BlockSize bytes.
+func newCTR(b *Block, iv []byte) CTR {
+	if len(iv) != BlockSize {
+		panic("bad IV length")
+	}
+	return CTR{
+		b:      *b,
+		ivlo:   byteorder.BEUint64(iv[8:16]),
+		ivhi:   byteorder.BEUint64(iv[0:8]),
+		offset: 0,
+	}
+}
+
+// XORKeyStream XORs src into dst at the current offset, then advances it by len(src).
+func (c *CTR) XORKeyStream(dst, src []byte) {
+	c.XORKeyStreamAt(dst, src, c.offset)
+	var carry uint64
+	c.offset, carry = bits.Add64(c.offset, uint64(len(src)), 0)
+	if carry != 0 {
+		panic("crypto/aes: counter overflow")
+	}
+}
+
+// RoundToBlock is used by CTR_DRBG, which discards the rightmost unused bits at
+// each request. It rounds c.offset up to the next multiple of BlockSize.
+func RoundToBlock(c *CTR) {
+	if remainder := c.offset % BlockSize; remainder != 0 {
+		var carry uint64
+		c.offset, carry = bits.Add64(c.offset, BlockSize-remainder, 0)
+		if carry != 0 {
+			panic("crypto/aes: counter overflow")
+		}
+	}
+}
+
+// XORKeyStreamAt behaves like XORKeyStream but keeps no state, and instead
+// seeks into the keystream by the given bytes offset from the start (ignoring
+// any XORKeyStream calls). This allows for random access into the keystream, up
+// to 16 EiB from the start.
+func (c *CTR) XORKeyStreamAt(dst, src []byte, offset uint64) {
+	if len(dst) < len(src) {
+		panic("crypto/aes: len(dst) < len(src)")
+	}
+	dst = dst[:len(src)] // exactly len(src) bytes are written
+	if alias.InexactOverlap(dst, src) {
+		panic("crypto/aes: invalid buffer overlap")
+	}
+	//fips140.RecordApproved()
+
+	ivlo, ivhi := add128(c.ivlo, c.ivhi, offset/BlockSize) // counter of the block containing offset
+
+	if blockOffset := offset % BlockSize; blockOffset != 0 {
+		// We have a partial block at the beginning; align src within a scratch block.
+		var in, out [BlockSize]byte
+		copy(in[blockOffset:], src)
+		ctrBlocks1(&c.b, &out, &in, ivlo, ivhi)
+		n := copy(dst, out[blockOffset:])
+		src = src[n:]
+		dst = dst[n:]
+		ivlo, ivhi = add128(ivlo, ivhi, 1)
+	}
+
+	for len(src) >= 8*BlockSize {
+		ctrBlocks8(&c.b, (*[8 * BlockSize]byte)(dst), (*[8 * BlockSize]byte)(src), ivlo, ivhi)
+		src = src[8*BlockSize:]
+		dst = dst[8*BlockSize:]
+		ivlo, ivhi = add128(ivlo, ivhi, 8)
+	}
+
+	// The tail can have at most 7 = 4 + 2 + 1 blocks.
+	if len(src) >= 4*BlockSize {
+		ctrBlocks4(&c.b, (*[4 * BlockSize]byte)(dst), (*[4 * BlockSize]byte)(src), ivlo, ivhi)
+		src = src[4*BlockSize:]
+		dst = dst[4*BlockSize:]
+		ivlo, ivhi = add128(ivlo, ivhi, 4)
+	}
+	if len(src) >= 2*BlockSize {
+		ctrBlocks2(&c.b, (*[2 * BlockSize]byte)(dst), (*[2 * BlockSize]byte)(src), ivlo, ivhi)
+		src = src[2*BlockSize:]
+		dst = dst[2*BlockSize:]
+		ivlo, ivhi = add128(ivlo, ivhi, 2)
+	}
+	if len(src) >= 1*BlockSize {
+		ctrBlocks1(&c.b, (*[1 * BlockSize]byte)(dst), (*[1 * BlockSize]byte)(src), ivlo, ivhi)
+		src = src[1*BlockSize:]
+		dst = dst[1*BlockSize:]
+		ivlo, ivhi = add128(ivlo, ivhi, 1)
+	}
+
+	if len(src) != 0 {
+		// We have a partial block at the end; only len(dst) keystream bytes are kept.
+		var in, out [BlockSize]byte
+		copy(in[:], src)
+		ctrBlocks1(&c.b, &out, &in, ivlo, ivhi)
+		copy(dst, out[:])
+	}
+}
+
+// Each ctrBlocksN function XORs src with N blocks of counter keystream, and
+// stores it in dst. src is loaded in full before storing dst, so they can
+// overlap even inexactly. The starting counter value is passed in as a pair of
+// 64-bit limbs (ivlo, ivhi) and is written big-endian into each counter block.
+
+func ctrBlocks(b *Block, dst, src []byte, ivlo, ivhi uint64) {
+	buf := make([]byte, len(src), 8*BlockSize) // one counter block per block of src
+	for i := 0; i < len(buf); i += BlockSize {
+		byteorder.BEPutUint64(buf[i:], ivhi)
+		byteorder.BEPutUint64(buf[i+8:], ivlo)
+		ivlo, ivhi = add128(ivlo, ivhi, 1)
+		encryptBlock(b, buf[i:], buf[i:]) // encrypt the counter in place to get keystream
+	}
+	// XOR into buf first, in case src and dst overlap (see above).
+	subtle.XORBytes(buf, src, buf)
+	copy(dst, buf)
+}
+
+// add128 adds x to the 128-bit value hi:lo and returns (lo, hi), wrapping on overflow.
+func add128(lo, hi uint64, x uint64) (uint64, uint64) {
+	sum, carry := bits.Add64(lo, x, 0)
+	return sum, hi + carry
+}
\ No newline at end of file
diff --git a/aes/ctr_noasm.go b/aes/ctr_noasm.go
new file mode 100644
index 0000000..e93fc80
--- /dev/null
+++ b/aes/ctr_noasm.go
@@ -0,0 +1,21 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package aes
+
+func ctrBlocks1(b *Block, dst, src *[BlockSize]byte, ivlo, ivhi uint64) {
+	ctrBlocks(b, dst[:], src[:], ivlo, ivhi) // one block via the generic ctrBlocks
+}
+
+func ctrBlocks2(b *Block, dst, src *[2 * BlockSize]byte, ivlo, ivhi uint64) {
+	ctrBlocks(b, dst[:], src[:], ivlo, ivhi) // two blocks via the generic ctrBlocks
+}
+
+func ctrBlocks4(b *Block, dst, src *[4 * BlockSize]byte, ivlo, ivhi uint64) {
+	ctrBlocks(b, dst[:], src[:], ivlo, ivhi) // four blocks via the generic ctrBlocks
+}
+
+func ctrBlocks8(b *Block, dst, src *[8 * BlockSize]byte, ivlo, ivhi uint64) {
+	ctrBlocks(b, dst[:], src[:], ivlo, ivhi) // eight blocks via the generic ctrBlocks
+}
\ No newline at end of file
diff --git a/drbg/ctrdrbg.go b/drbg/ctrdrbg.go
new file mode 100644
index 0000000..9d4257e
--- /dev/null
+++ b/drbg/ctrdrbg.go
@@ -0,0 +1,143 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package drbg
+
+import (
+	//"crypto/internal/fips140"
+	"github.com/xtls/reality/aes"
+	"github.com/xtls/reality/subtle"
+	"github.com/xtls/reality/byteorder"
+	"math/bits"
+)
+
+// Counter is an SP 800-90A Rev. 1 CTR_DRBG instantiated with AES-256.
+//
+// Per Table 3, it has a security strength of 256 bits, a seed size of 384 bits,
+// a counter length of 128 bits, a reseed interval of 2^48 requests, and a
+// maximum request size of 2^19 bits (2^16 bytes, 64 KiB).
+//
+// We support a narrow range of parameters that fit the needs of our RNG:
+// AES-256, no derivation function, no personalization string, no prediction
+// resistance, and 384-bit additional input.
+//
+// WARNING: this type provides tightly scoped support for the DRBG
+// functionality we need for FIPS 140-3 _only_. This type _should not_ be used
+// outside of the FIPS 140-3 module for any other use.
+//
+// In particular, as documented, Counter does not support the derivation
+// function or personalization strings, which are necessary for safely using
+// this DRBG for generic purposes without leaking sensitive values.
+type Counter struct {
+	// c is instantiated with K as the key and V as the counter.
+	c aes.CTR
+	// reseedCounter is 1 after NewCounter or Reseed and grows by one per Generate.
+	reseedCounter uint64
+}
+
+const (
+	keySize        = 256 / 8                 // AES-256 key length, in bytes
+	SeedSize       = keySize + aes.BlockSize // key plus one counter block, in bytes
+	reseedInterval = 1 << 48                 // Generate reports reseedRequired beyond this
+	maxRequestSize = (1 << 19) / 8           // 2^19 bits per Generate call, in bytes
+)
+
+// NewCounter instantiates a Counter from SeedSize bytes of entropy.
+func NewCounter(entropy *[SeedSize]byte) *Counter {
+	// CTR_DRBG_Instantiate_algorithm, per Section 10.2.1.3.1.
+	//fips140.RecordApproved()
+	K := make([]byte, keySize)
+	V := make([]byte, aes.BlockSize)
+
+	// V starts at 0, but is incremented in CTR_DRBG_Update before each use,
+	// unlike AES-CTR where it is incremented after each use.
+	V[len(V)-1] = 1
+
+	cipher, err := aes.New(K)
+	if err != nil {
+		panic(err)
+	}
+
+	c := &Counter{}
+	c.c = *aes.NewCTR(cipher, V)
+	c.update(entropy)
+	c.reseedCounter = 1
+	return c
+}
+
+// update derives a new key and counter by XORing seed with the current keystream.
+func (c *Counter) update(seed *[SeedSize]byte) {
+	// CTR_DRBG_Update, per Section 10.2.1.2.
+	temp := make([]byte, SeedSize)
+	c.c.XORKeyStream(temp, seed[:])
+	K := temp[:keySize]
+	V := temp[keySize:]
+
+	// Again, we pre-increment V, like in NewCounter.
+	increment((*[aes.BlockSize]byte)(V))
+
+	cipher, err := aes.New(K)
+	if err != nil {
+		panic(err)
+	}
+	c.c = *aes.NewCTR(cipher, V)
+}
+
+// increment adds one to v, interpreted as a 128-bit big-endian integer, and
+// wraps around to zero on overflow.
+func increment(v *[aes.BlockSize]byte) {
+	hi, lo := byteorder.BEUint64(v[:8]), byteorder.BEUint64(v[8:])
+	lo, carry := bits.Add64(lo, 1, 0)
+	byteorder.BEPutUint64(v[:8], hi+carry)
+	byteorder.BEPutUint64(v[8:], lo)
+}
+
+// Reseed updates the state with entropy XOR additionalInput and resets reseedCounter.
+func (c *Counter) Reseed(entropy, additionalInput *[SeedSize]byte) {
+	// CTR_DRBG_Reseed_algorithm, per Section 10.2.1.4.1.
+	//fips140.RecordApproved()
+	var seed [SeedSize]byte
+	subtle.XORBytes(seed[:], entropy[:], additionalInput[:])
+	c.update(&seed)
+	c.reseedCounter = 1
+}
+
+// Generate produces at most maxRequestSize bytes of random data in out.
+// It returns true, leaving out untouched, when a reseed is required first.
+func (c *Counter) Generate(out []byte, additionalInput *[SeedSize]byte) (reseedRequired bool) {
+	// CTR_DRBG_Generate_algorithm, per Section 10.2.1.5.1.
+	//fips140.RecordApproved()
+	if len(out) > maxRequestSize {
+		panic("crypto/drbg: internal error: request size exceeds maximum")
+	}
+
+	// Step 1.
+	if c.reseedCounter > reseedInterval {
+		return true
+	}
+
+	// Step 2.
+	if additionalInput != nil {
+		c.update(additionalInput)
+	} else {
+		// If the additional input is null, the first CTR_DRBG_Update is
+		// skipped, but the additional input is replaced with an all-zero string
+		// for the second CTR_DRBG_Update.
+		additionalInput = new([SeedSize]byte)
+	}
+
+	// Steps 3-5.
+	clear(out)
+	c.c.XORKeyStream(out, out) // out is all zeros, so this writes raw keystream
+	aes.RoundToBlock(&c.c)
+
+	// Step 6.
+	c.update(additionalInput)
+
+	// Step 7.
+	c.reseedCounter++
+
+	// Step 8.
+	return false
+}
\ No newline at end of file
diff --git a/drbg/rand.go b/drbg/rand.go
new file mode 100644
index 0000000..7fe0b57
--- /dev/null
+++ b/drbg/rand.go
@@ -0,0 +1,102 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package drbg provides cryptographically secure random bytes
+// usable by FIPS code, using an SP 800-90A Rev. 1 Deterministic
+// Random Bit Generator (DRBG). In this copy the non-FIPS fallback to the
+// operating system's random number generator is commented out in Read.
+package drbg
+
+import (
+	"github.com/xtls/reality/entropy"
+	// "crypto/internal/fips140"
+	"github.com/xtls/reality/randutil"
+	// "github.com/xtls/reality/sysrand"
+	
+	"crypto/rand"
+	"io"
+	"sync"
+)
+
+var drbgs = sync.Pool{ // pool of *Counter values reused across Read calls
+	New: func() any {
+		var c *Counter
+		entropy.Depleted(func(seed *[48]byte) { // seed each new Counter from the entropy source
+			c = NewCounter(seed)
+		})
+		return c
+	},
+}
+
+// Read fills b with cryptographically secure random bytes drawn from a pooled
+// SP 800-90A Rev. 1 CTR_DRBG. The non-FIPS shortcut to the operating system's
+// generator is commented out below, so the DRBG is used unconditionally.
+func Read(b []byte) {
+	// if !fips140.Enabled {
+	// 	rand.Read(b)
+	// 	return
+	// }
+
+	// At every read, 128 random bits from the operating system are mixed as
+	// additional input, to make the output as strong as non-FIPS randomness.
+	// This is not credited as entropy for FIPS purposes, as allowed by Section
+	// 8.7.2: "Note that a DRBG does not rely on additional input to provide
+	// entropy, even though entropy could be provided in the additional input".
+	additionalInput := new([SeedSize]byte)
+	rand.Read(additionalInput[:16]) // NOTE(review): error discarded; Read cannot fail only on Go 1.24+ — confirm
+
+	drbg := drbgs.Get().(*Counter)
+	defer drbgs.Put(drbg)
+
+	for len(b) > 0 {
+		size := min(len(b), maxRequestSize) // Generate panics on larger requests
+		if reseedRequired := drbg.Generate(b[:size], additionalInput); reseedRequired {
+			// See SP 800-90A Rev. 1, Section 9.3.1, Steps 6-8, as explained in
+			// Section 9.3.2: if Generate reports a reseed is required, the
+			// additional input is passed to Reseed along with the entropy and
+			// then nulled before the next Generate call.
+			entropy.Depleted(func(seed *[48]byte) {
+				drbg.Reseed(seed, additionalInput)
+			})
+			additionalInput = nil
+			continue
+		}
+		b = b[size:]
+	}
+}
+
+// DefaultReader is a sentinel type used to recognize the default rand reader
+// when it is passed to APIs that accept a rand io.Reader. NOTE(review): only
+// types embedding this package's DefaultReader satisfy it — confirm callers do.
+type DefaultReader interface{ defaultReader() }
+
+// ReadWithReader uses Reader to fill b with cryptographically secure random
+// bytes. It is intended for use in APIs that expose a rand io.Reader.
+//
+// If r is not a DefaultReader, [randutil.MaybeReadByte] is called before the
+// read; the fips140.RecordNonApproved call is commented out in this copy.
+func ReadWithReader(r io.Reader, b []byte) error {
+	if _, ok := r.(DefaultReader); ok {
+		Read(b)
+		return nil
+	}
+
+	//fips140.RecordNonApproved()
+	randutil.MaybeReadByte(r)
+	_, err := io.ReadFull(r, b) // errors unless r supplies all len(b) bytes
+	return err
+}
+
+// ReadWithReaderDeterministic is like ReadWithReader, but it doesn't call
+// [randutil.MaybeReadByte] on non-default Readers.
+func ReadWithReaderDeterministic(r io.Reader, b []byte) error {
+	if _, ok := r.(DefaultReader); ok {
+		Read(b)
+		return nil
+	}
+
+	//fips140.RecordNonApproved()
+	_, err := io.ReadFull(r, b) // errors unless r supplies all len(b) bytes
+	return err
+}
\ No newline at end of file
diff --git a/entropy/entropy.go b/entropy/entropy.go
new file mode 100644
index 0000000..0bea141
--- /dev/null
+++ b/entropy/entropy.go
@@ -0,0 +1,29 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package entropy provides the passive entropy source for the FIPS 140-3
+// module. It is only used in FIPS mode by [crypto/internal/fips140/drbg.Read].
+//
+// This complies with IG 9.3.A, Additional Comment 12, which until January 1,
+// 2026 allows new modules to meet an [earlier version] of Resolution 2(b):
+// "A software module that contains an approved DRBG that receives a LOAD
+// command (or its logical equivalent) with entropy obtained from [...] inside
+// the physical perimeter of the operational environment of the module [...]."
+//
+// Distributions that have their own SP 800-90B entropy source should replace
+// this package with their own implementation.
+//
+// [earlier version]: https://csrc.nist.gov/CSRC/media/Projects/cryptographic-module-validation-program/documents/IG%209.3.A%20Resolution%202b%5BMarch%2026%202024%5D.pdf
+package entropy
+
+// "github.com/xtls/reality/sysrand"
+import "crypto/rand"
+
+// Depleted notifies the entropy source that the entropy in the module is
+// "depleted": it reads 48 bytes from crypto/rand and passes them to LOAD.
+func Depleted(LOAD func(*[48]byte)) {
+	var entropy [48]byte
+	rand.Read(entropy[:]) // NOTE(review): error discarded; a failure would LOAD zeros — confirm Go >= 1.24
+	LOAD(&entropy)
+}
\ No newline at end of file
diff --git a/handshake_client.go b/handshake_client.go
index f3a0738..9cd987a 100644
--- a/handshake_client.go
+++ b/handshake_client.go
@@ -23,7 +23,7 @@ import (
 	"time"
 
 	"github.com/xtls/reality/hpke"
-	"github.com/xtls/reality/mlkem768"
+	"github.com/xtls/reality/mlkem"
 	"github.com/xtls/reality/tls13"
 )
 
@@ -160,11 +160,11 @@ func (c *Conn) makeClientHello() (*clientHelloMsg, *keySharePrivateKeys, *echCon
 			if err != nil {
 				return nil, nil, nil, err
 			}
-			seed := make([]byte, mlkem768.SeedSize)
+			seed := make([]byte, mlkem.SeedSize)
 			if _, err := io.ReadFull(config.rand(), seed); err != nil {
 				return nil, nil, nil, err
 			}
-			keyShareKeys.kyber, err = mlkem768.NewDecapsulationKey768(seed)
+			keyShareKeys.kyber, err = mlkem.NewDecapsulationKey768(seed)
 			if err != nil {
 				return nil, nil, nil, err
 			}
diff --git a/handshake_client_tls13.go b/handshake_client_tls13.go
index 62aee55..f23bc8f 100644
--- a/handshake_client_tls13.go
+++ b/handshake_client_tls13.go
@@ -16,7 +16,7 @@ import (
 	"slices"
 	"time"
 
-	"github.com/xtls/reality/mlkem768"
+	"github.com/xtls/reality/mlkem"
 	"github.com/xtls/reality/tls13"
 	"golang.org/x/crypto/hkdf"
 )
@@ -481,7 +481,7 @@ func (hs *clientHandshakeStateTLS13) establishHandshakeKeys() error {
 
 	ecdhePeerData := hs.serverHello.serverShare.data
 	if hs.serverHello.serverShare.group == x25519Kyber768Draft00 {
-		if len(ecdhePeerData) != x25519PublicKeySize+mlkem768.CiphertextSize768 {
+		if len(ecdhePeerData) != x25519PublicKeySize+mlkem.CiphertextSize768 {
 			c.sendAlert(alertIllegalParameter)
 			return errors.New("tls: invalid server key share")
 		}
diff --git a/handshake_server_tls13.go b/handshake_server_tls13.go
index abdb8ae..1dce1c7 100644
--- a/handshake_server_tls13.go
+++ b/handshake_server_tls13.go
@@ -22,7 +22,7 @@ import (
 	"slices"
 	"time"
 
-	"github.com/xtls/reality/mlkem768"
+	"github.com/xtls/reality/mlkem"
 	"github.com/xtls/reality/tls13"
 )
 
@@ -276,7 +276,7 @@ func (hs *serverHandshakeStateTLS13) processClientHello() error {
 	ecdhData := clientKeyShare.data
 	if selectedGroup == x25519Kyber768Draft00 {
 		ecdhGroup = X25519
-		if len(ecdhData) != x25519PublicKeySize+mlkem768.EncapsulationKeySize768 {
+		if len(ecdhData) != x25519PublicKeySize+mlkem.EncapsulationKeySize768 {
 			c.sendAlert(alertIllegalParameter)
 			return errors.New("tls: invalid Kyber client key share")
 		}
diff --git a/key_schedule.go b/key_schedule.go
index 8d96223..7113751 100644
--- a/key_schedule.go
+++ b/key_schedule.go
@@ -11,9 +11,8 @@ import (
 	"hash"
 	"io"
 
-	"golang.org/x/crypto/sha3"
-
-	"github.com/xtls/reality/mlkem768"
+	"github.com/xtls/reality/mlkem"
+	"github.com/xtls/reality/sha3"
 	"github.com/xtls/reality/tls13"
 )
 
@@ -55,11 +54,11 @@ func (c *cipherSuiteTLS13) exportKeyingMaterial(s *tls13.MasterSecret, transcrip
 type keySharePrivateKeys struct {
 	curveID CurveID
 	ecdhe   *ecdh.PrivateKey
-	kyber   *mlkem768.DecapsulationKey768
+	kyber   *mlkem.DecapsulationKey768
 }
 
 // kyberDecapsulate implements decapsulation according to Kyber Round 3.
-func kyberDecapsulate(dk *mlkem768.DecapsulationKey768, c []byte) ([]byte, error) {
+func kyberDecapsulate(dk *mlkem.DecapsulationKey768, c []byte) ([]byte, error) {
 	K, err := dk.Decapsulate(c)
 	if err != nil {
 		return nil, err
@@ -69,7 +68,7 @@ func kyberDecapsulate(dk *mlkem768.DecapsulationKey768, c []byte) ([]byte, error
 
 // kyberEncapsulate implements encapsulation according to Kyber Round 3.
 func kyberEncapsulate(ek []byte) (c, ss []byte, err error) {
-	k, err := mlkem768.NewEncapsulationKey768(ek)
+	k, err := mlkem.NewEncapsulationKey768(ek)
 	if err != nil {
 		return nil, nil, err
 	}
@@ -78,13 +77,14 @@ func kyberEncapsulate(ek []byte) (c, ss []byte, err error) {
 }
 
 func kyberSharedSecret(c, K []byte) []byte {
-	// Package mlkem768 implements ML-KEM, which compared to Kyber removed a
+	// Package mlkem implements ML-KEM, which compared to Kyber removed a
 	// final hashing step. Compute SHAKE-256(K || SHA3-256(c), 32) to match Kyber.
 	// See https://words.filippo.io/mlkem768/#bonus-track-using-a-ml-kem-implementation-as-kyber-v3.
 	h := sha3.NewShake256()
 	h.Write(K)
-	ch := sha3.Sum256(c)
-	h.Write(ch[:])
+	ch := sha3.New256()
+	ch.Write(c)
+	h.Write(ch.Sum(nil))
 	out := make([]byte, 32)
 	h.Read(out)
 	return out
diff --git a/mlkem/field.go b/mlkem/field.go
new file mode 100644
index 0000000..4f94ea9
--- /dev/null
+++ b/mlkem/field.go
@@ -0,0 +1,550 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mlkem
+
+import (
+	"github.com/xtls/reality/sha3"
+	"github.com/xtls/reality/byteorder"
+	"errors"
+)
+
+// fieldElement is an integer modulo q, an element of ℤ_q. It is always reduced (< q).
+type fieldElement uint16
+
+// fieldCheckReduced converts a to a fieldElement, or returns an error if a is not < q.
+func fieldCheckReduced(a uint16) (fieldElement, error) {
+	if a < q {
+		return fieldElement(a), nil
+	}
+	return 0, errors.New("unreduced field element")
+}
+
+// fieldReduceOnce reduces a value a < 2q to the range [0, q) without branching.
+func fieldReduceOnce(a uint16) fieldElement {
+	x := a - q
+	// If x underflowed, then x >= 2¹⁶ - q > 2¹⁵, so the top bit is set.
+	x += (x >> 15) * q // adds q back only when the subtraction underflowed
+	return fieldElement(x)
+}
+
+// fieldAdd returns a + b mod q.
+func fieldAdd(a, b fieldElement) fieldElement {
+	return fieldReduceOnce(uint16(a + b))
+}
+
+// fieldSub returns a - b mod q; q is added before the single reduction step.
+func fieldSub(a, b fieldElement) fieldElement {
+	return fieldReduceOnce(uint16(a - b + q))
+}
+
+const (
+	barrettMultiplier = 5039 // 2¹² * 2¹² / q, rounded down
+	barrettShift      = 24   // log₂(2¹² * 2¹²)
+)
+
+// fieldReduce reduces a value a < 2q² using Barrett reduction, to avoid
+// potentially variable-time division.
+func fieldReduce(a uint32) fieldElement {
+	quotient := uint32((uint64(a) * barrettMultiplier) >> barrettShift) // estimate of a / q
+	return fieldReduceOnce(uint16(a - quotient*q))                      // final conditional subtraction of q
+}
+
+// fieldMul returns a * b mod q, widening to uint32 before multiplying.
+func fieldMul(a, b fieldElement) fieldElement {
+	return fieldReduce(uint32(a) * uint32(b))
+}
+
+// fieldMulSub returns a * (b - c). This operation is fused to save a
+// fieldReduceOnce after the subtraction: b - c + q goes into the product
+// unreduced, and only the product itself is reduced.
+func fieldMulSub(a, b, c fieldElement) fieldElement {
+	return fieldReduce(uint32(a) * uint32(b-c+q))
+}
+
+// fieldAddMul returns a * b + c * d. This operation is fused to save a
+// fieldReduceOnce and a fieldReduce: the two products are summed as uint32
+// values and reduced together in a single step.
+func fieldAddMul(a, b, c, d fieldElement) fieldElement {
+	sum := uint32(a)*uint32(b) + uint32(c)*uint32(d)
+	return fieldReduce(sum)
+}
+
+// compress maps a field element uniformly to the range 0 to 2ᵈ-1, according to
+// FIPS 203, Definition 4.7.
+func compress(x fieldElement, d uint8) uint16 {
+	// We want to compute (x * 2ᵈ) / q, rounded to nearest integer, with 1/2
+	// rounding up (see FIPS 203, Section 2.3).
+
+	// Barrett reduction produces a quotient and a remainder in the range [0, 2q),
+	// such that dividend = quotient * q + remainder.
+	dividend := uint32(x) << d // x * 2ᵈ
+	quotient := uint32(uint64(dividend) * barrettMultiplier >> barrettShift)
+	remainder := dividend - quotient*q
+
+	// Since the remainder is in the range [0, 2q), not [0, q), we need to
+	// portion it into three spans for rounding.
+	//
+	//     [ 0,       q/2     ) -> round to 0
+	//     [ q/2,     q + q/2 ) -> round to 1
+	//     [ q + q/2, 2q      ) -> round to 2
+	//
+	// We can convert that to the following logic: add 1 if remainder > q/2,
+	// then add 1 again if remainder > q + q/2.
+	//
+	// Note that if remainder > x, then ⌊x⌋ - remainder underflows, and the top
+	// bit of the difference will be set.
+	quotient += (q/2 - remainder) >> 31 & 1
+	quotient += (q + q/2 - remainder) >> 31 & 1
+
+	// quotient might have overflowed at this point, so reduce it by masking.
+	var mask uint32 = (1 << d) - 1 // keeps only the low d bits
+	return uint16(quotient & mask)
+}
+
+// decompress maps a number x between 0 and 2ᵈ-1 uniformly to the full range of
+// field elements, according to FIPS 203, Definition 4.8.
+func decompress(y uint16, d uint8) fieldElement {
+	// We want to compute (y * q) / 2ᵈ, rounded to nearest integer, with 1/2
+	// rounding up (see FIPS 203, Section 2.3).
+
+	dividend := uint32(y) * q
+	quotient := dividend >> d // (y * q) / 2ᵈ
+
+	// The d'th least-significant bit of the dividend (the most significant bit
+	// of the remainder) is 1 for the top half of the values that divide to the
+	// same quotient, which are the ones that round up.
+	quotient += dividend >> (d - 1) & 1 // NOTE(review): d - 1 assumes d >= 1 — confirm callers
+
+	// quotient is at most (2¹¹-1) * q / 2¹¹ + 1 = 3328, so it didn't overflow.
+	return fieldElement(quotient)
+}
+
+// ringElement is a polynomial, an element of R_q, represented as an array of
+// n coefficients according to FIPS 203, Section 2.4.4.
+type ringElement [n]fieldElement
+
+// polyAdd returns the coefficient-wise sum of two ringElements or nttElements.
+func polyAdd[T ~[n]fieldElement](a, b T) (s T) {
+	for i, x := range a {
+		s[i] = fieldAdd(x, b[i])
+	}
+	return s
+}
+
+// polySub returns the coefficient-wise difference a - b of two ringElements or nttElements.
+func polySub[T ~[n]fieldElement](a, b T) (s T) {
+	for i, x := range a {
+		s[i] = fieldSub(x, b[i])
+	}
+	return s
+}
+
+// polyByteEncode appends the 384-byte encoding of f to b.
+//
+// It implements ByteEncode₁₂, according to FIPS 203, Algorithm 5.
+func polyByteEncode[T ~[n]fieldElement](b []byte, f T) []byte {
+	out, B := sliceForAppend(b, encodingSize12)
+	for i := 0; i < n; i += 2 {
+		x := uint32(f[i]) | uint32(f[i+1])<<12 // two 12-bit coefficients in 24 bits
+		B[0] = uint8(x)
+		B[1] = uint8(x >> 8)
+		B[2] = uint8(x >> 16)
+		B = B[3:] // advance past the three bytes just written
+	}
+	return out
+}
+
+// polyByteDecode decodes the 384-byte encoding of a polynomial, checking that
+// all the coefficients are properly reduced. This fulfills the "Modulus check"
+// step of ML-KEM Encapsulation.
+//
+// It implements ByteDecode₁₂, according to FIPS 203, Algorithm 6.
+func polyByteDecode[T ~[n]fieldElement](b []byte) (T, error) {
+	if len(b) != encodingSize12 {
+		return T{}, errors.New("mlkem: invalid encoding length")
+	}
+	var f T
+	for i := 0; i < n; i += 2 {
+		d := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 // 24 bits: two 12-bit coefficients
+		const mask12 = 0b1111_1111_1111
+		var err error
+		if f[i], err = fieldCheckReduced(uint16(d & mask12)); err != nil {
+			return T{}, errors.New("mlkem: invalid polynomial encoding")
+		}
+		if f[i+1], err = fieldCheckReduced(uint16(d >> 12)); err != nil {
+			return T{}, errors.New("mlkem: invalid polynomial encoding")
+		}
+		b = b[3:]
+	}
+	return f, nil
+}
+
+// sliceForAppend extends in by n bytes. head is the extended slice (the
+// contents of in followed by n more bytes) and tail aliases just those n extra
+// bytes. The backing array of in is reused when its capacity suffices;
+// otherwise a new array of exactly the needed size is allocated and filled.
+func sliceForAppend(in []byte, n int) (head, tail []byte) {
+	total := len(in) + n
+	if cap(in) < total {
+		head = make([]byte, total)
+		copy(head, in)
+	} else {
+		head = in[:total]
+	}
+	return head, head[len(in):]
+}
+
+// ringCompressAndEncode1 appends a 32-byte encoding of a ring element to s,
+// compressing each coefficient to a single bit.
+//
+// It implements Compress₁, according to FIPS 203, Definition 4.7,
+// followed by ByteEncode₁, according to FIPS 203, Algorithm 5.
+func ringCompressAndEncode1(s []byte, f ringElement) []byte {
+	s, b := sliceForAppend(s, encodingSize1)
+	// The appended bytes may be reused capacity holding stale data, and the
+	// bits are OR-ed in below, so they must start out as zero.
+	clear(b)
+	for i, coeff := range f {
+		b[i/8] |= uint8(compress(coeff, 1) << (i % 8))
+	}
+	return s
+}
+
+// ringDecodeAndDecompress1 expands a 32-byte slice into a ring element, one
+// coefficient per bit: a 0 bit becomes 0 and a 1 bit becomes ⌈q/2⌋.
+//
+// It implements ByteDecode₁, according to FIPS 203, Algorithm 6,
+// followed by Decompress₁, according to FIPS 203, Definition 4.8.
+func ringDecodeAndDecompress1(b *[encodingSize1]byte) ringElement {
+	const halfQ = (q + 1) / 2 // ⌈q/2⌋, rounded up per FIPS 203, Section 2.3
+	var f ringElement
+	for i := range f {
+		bit := b[i/8] >> (i % 8) & 1
+		f[i] = fieldElement(bit) * halfQ // 0 decompresses to 0, and 1 to ⌈q/2⌋
+	}
+	return f
+}
+
+// ringCompressAndEncode4 appends a 128-byte encoding of a ring element to s,
+// compressing two coefficients per byte.
+//
+// It implements Compress₄, according to FIPS 203, Definition 4.7,
+// followed by ByteEncode₄, according to FIPS 203, Algorithm 5.
+func ringCompressAndEncode4(s []byte, f ringElement) []byte {
+	s, b := sliceForAppend(s, encodingSize4)
+	for i := 0; i < n; i += 2 {
+		b[i/2] = uint8(compress(f[i], 4) | compress(f[i+1], 4)<<4) // f[i] low nibble, f[i+1] high
+	}
+	return s
+}
+
+// ringDecodeAndDecompress4 expands a 128-byte encoding into a ring element.
+// Each byte holds two 4-bit values, low nibble first, and each is decompressed.
+//
+// It implements ByteDecode₄, according to FIPS 203, Algorithm 6,
+// followed by Decompress₄, according to FIPS 203, Definition 4.8.
+func ringDecodeAndDecompress4(b *[encodingSize4]byte) ringElement {
+	var f ringElement
+	for i := 0; i < n; i += 2 {
+		packed := b[i/2]
+		f[i], f[i+1] = decompress(uint16(packed&0b1111), 4), decompress(uint16(packed>>4), 4)
+	}
+	return f
+}
+
+// ringCompressAndEncode10 appends a 320-byte encoding of a ring element to s,
+// compressing four coefficients per five bytes.
+//
+// It implements Compress₁₀, according to FIPS 203, Definition 4.7,
+// followed by ByteEncode₁₀, according to FIPS 203, Algorithm 5.
+func ringCompressAndEncode10(s []byte, f ringElement) []byte {
+	s, b := sliceForAppend(s, encodingSize10)
+	for i := 0; i < n; i += 4 {
+		var x uint64 // accumulates four 10-bit values, 40 bits in total
+		x |= uint64(compress(f[i], 10))
+		x |= uint64(compress(f[i+1], 10)) << 10
+		x |= uint64(compress(f[i+2], 10)) << 20
+		x |= uint64(compress(f[i+3], 10)) << 30
+		b[0] = uint8(x)
+		b[1] = uint8(x >> 8)
+		b[2] = uint8(x >> 16)
+		b[3] = uint8(x >> 24)
+		b[4] = uint8(x >> 32)
+		b = b[5:] // advance past the five bytes just written
+	}
+	return s
+}
+
+// ringDecodeAndDecompress10 expands a 320-byte encoding into a ring element.
+// Every five bytes are read as a little-endian 40-bit group that holds four
+// 10-bit values, lowest bits first, and each value is decompressed.
+//
+// It implements ByteDecode₁₀, according to FIPS 203, Algorithm 6,
+// followed by Decompress₁₀, according to FIPS 203, Definition 4.8.
+func ringDecodeAndDecompress10(bb *[encodingSize10]byte) ringElement {
+	b := bb[:]
+	var f ringElement
+	for i := 0; i < n; i += 4 {
+		x := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32
+		b = b[5:]
+		for j := 0; j < 4; j++ {
+			f[i+j] = decompress(uint16(x>>(10*j)&0b11_1111_1111), 10)
+		}
+	}
+	return f
+}
+
+// ringCompressAndEncode appends an encoding of a ring element to s,
+// compressing each coefficient to d bits.
+//
+// It implements Compress, according to FIPS 203, Definition 4.7,
+// followed by ByteEncode, according to FIPS 203, Algorithm 5.
+func ringCompressAndEncode(s []byte, f ringElement, d uint8) []byte {
+	var b byte     // output byte being assembled
+	var bIdx uint8 // number of bits already filled in b
+	for i := 0; i < n; i++ {
+		c := compress(f[i], d)
+		var cIdx uint8 // number of bits of c already emitted
+		for cIdx < d {
+			b |= byte(c>>cIdx) << bIdx
+			bits := min(8-bIdx, d-cIdx) // bits that fit in b and remain in c
+			bIdx += bits
+			cIdx += bits
+			if bIdx == 8 {
+				s = append(s, b)
+				b = 0
+				bIdx = 0
+			}
+		}
+	}
+	if bIdx != 0 {
+		panic("mlkem: internal error: bitsFilled != 0")
+	}
+	return s
+}
+
+// ringDecodeAndDecompress decodes an encoding of a ring element where
+// each d bits are mapped to an equidistant distribution.
+//
+// It implements ByteDecode, according to FIPS 203, Algorithm 6,
+// followed by Decompress, according to FIPS 203, Definition 4.8.
+func ringDecodeAndDecompress(b []byte, d uint8) ringElement {
+	var f ringElement
+	var bIdx uint8 // number of bits of b[0] already consumed
+	for i := 0; i < n; i++ {
+		var c uint16
+		var cIdx uint8 // number of bits of c already collected
+		for cIdx < d {
+			c |= uint16(b[0]>>bIdx) << cIdx
+			c &= (1 << d) - 1 // drop bits that belong to the next coefficient
+			bits := min(8-bIdx, d-cIdx)
+			bIdx += bits
+			cIdx += bits
+			if bIdx == 8 {
+				b = b[1:]
+				bIdx = 0
+			}
+		}
+		f[i] = fieldElement(decompress(c, d))
+	}
+	if len(b) != 0 {
+		panic("mlkem: internal error: leftover bytes")
+	}
+	return f
+}
+
+// ringCompressAndEncode5 appends a 160-byte encoding of a ring element to s,
+// compressing eight coefficients per five bytes.
+//
+// It implements Compress₅, according to FIPS 203, Definition 4.7,
+// followed by ByteEncode₅, according to FIPS 203, Algorithm 5.
+func ringCompressAndEncode5(s []byte, f ringElement) []byte {
+	return ringCompressAndEncode(s, f, 5) // generic bit packer with d = 5
+}
+
+// ringDecodeAndDecompress5 decodes a 160-byte encoding of a ring element where
+// each five bits are mapped to an equidistant distribution.
+//
+// It implements ByteDecode₅, according to FIPS 203, Algorithm 6,
+// followed by Decompress₅, according to FIPS 203, Definition 4.8.
+func ringDecodeAndDecompress5(bb *[encodingSize5]byte) ringElement {
+	return ringDecodeAndDecompress(bb[:], 5) // generic bit unpacker with d = 5
+}
+
+// ringCompressAndEncode11 appends a 352-byte encoding of a ring element to s,
+// compressing eight coefficients per eleven bytes.
+//
+// It implements Compress₁₁, according to FIPS 203, Definition 4.7,
+// followed by ByteEncode₁₁, according to FIPS 203, Algorithm 5.
+func ringCompressAndEncode11(s []byte, f ringElement) []byte {
+	return ringCompressAndEncode(s, f, 11) // generic bit packer with d = 11
+}
+
+// ringDecodeAndDecompress11 decodes a 352-byte encoding of a ring element where
+// each eleven bits are mapped to an equidistant distribution.
+//
+// It implements ByteDecode₁₁, according to FIPS 203, Algorithm 6,
+// followed by Decompress₁₁, according to FIPS 203, Definition 4.8.
+func ringDecodeAndDecompress11(bb *[encodingSize11]byte) ringElement {
+	return ringDecodeAndDecompress(bb[:], 11) // generic bit unpacker with d = 11
+}
+
+// samplePolyCBD draws a ringElement from the special Dη distribution given a
+// stream of random bytes generated by the PRF function, according to FIPS 203,
+// Algorithm 8 and Definition 4.3.
+func samplePolyCBD(s []byte, b byte) ringElement {
+	prf := sha3.NewShake256()
+	prf.Write(s)
+	prf.Write([]byte{b})
+	B := make([]byte, 64*2) // η = 2
+	prf.Read(B)
+
+	// SamplePolyCBD simply draws four (2η) bits for each coefficient, and adds
+	// the first two and subtracts the last two.
+
+	var f ringElement
+	for i := 0; i < n; i += 2 {
+		b := B[i/2]
+		b_7, b_6, b_5, b_4 := b>>7, b>>6&1, b>>5&1, b>>4&1
+		b_3, b_2, b_1, b_0 := b>>3&1, b>>2&1, b>>1&1, b&1
+		f[i] = fieldSub(fieldElement(b_0+b_1), fieldElement(b_2+b_3))
+		f[i+1] = fieldSub(fieldElement(b_4+b_5), fieldElement(b_6+b_7))
+	}
+	return f
+}
+
+// nttElement is an NTT representation, an element of T_q, represented as an
+// array according to FIPS 203, Section 2.4.4.
+type nttElement [n]fieldElement
+
+// gammas are the values ζ^2BitRev7(i)+1 mod q for each index i, according to
+// FIPS 203, Appendix A (with negative values reduced to positive).
+var gammas = [128]fieldElement{17, 3312, 2761, 568, 583, 2746, 2649, 680, 1637, 1692, 723, 2606, 2288, 1041, 1100, 2229, 1409, 1920, 2662, 667, 3281, 48, 233, 3096, 756, 2573, 2156, 1173, 3015, 314, 3050, 279, 1703, 1626, 1651, 1678, 2789, 540, 1789, 1540, 1847, 1482, 952, 2377, 1461, 1868, 2687, 642, 939, 2390, 2308, 1021, 2437, 892, 2388, 941, 733, 2596, 2337, 992, 268, 3061, 641, 2688, 1584, 1745, 2298, 1031, 2037, 1292, 3220, 109, 375, 2954, 2549, 780, 2090, 1239, 1645, 1684, 1063, 2266, 319, 3010, 2773, 556, 757, 2572, 2099, 1230, 561, 2768, 2466, 863, 2594, 735, 2804, 525, 1092, 2237, 403, 2926, 1026, 2303, 1143, 2186, 2150, 1179, 2775, 554, 886, 2443, 1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300, 2110, 1219, 2935, 394, 885, 2444, 2154, 1175}
+
+// nttMul multiplies two nttElements.
+//
+// It implements MultiplyNTTs, according to FIPS 203, Algorithm 11.
+func nttMul(f, g nttElement) nttElement {
+	var h nttElement
+	// We use i += 2 for bounds check elimination. See https://go.dev/issue/66826.
+	for i := 0; i < 256; i += 2 {
+		a0, a1 := f[i], f[i+1]
+		b0, b1 := g[i], g[i+1]
+		h[i] = fieldAddMul(a0, b0, fieldMul(a1, b1), gammas[i/2])
+		h[i+1] = fieldAddMul(a0, b1, a1, b0)
+	}
+	return h
+}
+
+// zetas are the values ζ^BitRev7(k) mod q for each index k, according to FIPS
+// 203, Appendix A.
+var zetas = [128]fieldElement{1, 1729, 2580, 3289, 2642, 630, 1897, 848, 1062, 1919, 193, 797, 2786, 3260, 569, 1746, 296, 2447, 1339, 1476, 3046, 56, 2240, 1333, 1426, 2094, 535, 2882, 2393, 2879, 1974, 821, 289, 331, 3253, 1756, 1197, 2304, 2277, 2055, 650, 1977, 2513, 632, 2865, 33, 1320, 1915, 2319, 1435, 807, 452, 1438, 2868, 1534, 2402, 2647, 2617, 1481, 648, 2474, 3110, 1227, 910, 17, 2761, 583, 2649, 1637, 723, 2288, 1100, 1409, 2662, 3281, 233, 756, 2156, 3015, 3050, 1703, 1651, 2789, 1789, 1847, 952, 1461, 2687, 939, 2308, 2437, 2388, 733, 2337, 268, 641, 1584, 2298, 2037, 3220, 375, 2549, 2090, 1645, 1063, 319, 2773, 757, 2099, 561, 2466, 2594, 2804, 1092, 403, 1026, 1143, 2150, 2775, 886, 1722, 1212, 1874, 1029, 2110, 2935, 885, 2154}
+
+// ntt maps a ringElement to its nttElement representation.
+//
+// It implements NTT, according to FIPS 203, Algorithm 9.
+func ntt(f ringElement) nttElement {
+	k := 1
+	for len := 128; len >= 2; len /= 2 {
+		for start := 0; start < 256; start += 2 * len {
+			zeta := zetas[k]
+			k++
+			// Bounds check elimination hint.
+			f, flen := f[start:start+len], f[start+len:start+len+len]
+			for j := 0; j < len; j++ {
+				t := fieldMul(zeta, flen[j])
+				flen[j] = fieldSub(f[j], t)
+				f[j] = fieldAdd(f[j], t)
+			}
+		}
+	}
+	return nttElement(f)
+}
+
+// inverseNTT maps a nttElement back to the ringElement it represents.
+//
+// It implements NTT⁻¹, according to FIPS 203, Algorithm 10.
+func inverseNTT(f nttElement) ringElement {
+	k := 127
+	for len := 2; len <= 128; len *= 2 {
+		for start := 0; start < 256; start += 2 * len {
+			zeta := zetas[k]
+			k--
+			// Bounds check elimination hint.
+			f, flen := f[start:start+len], f[start+len:start+len+len]
+			for j := 0; j < len; j++ {
+				t := f[j]
+				f[j] = fieldAdd(t, flen[j])
+				flen[j] = fieldMulSub(zeta, flen[j], t)
+			}
+		}
+	}
+	for i := range f {
+		f[i] = fieldMul(f[i], 3303) // 3303 = 128⁻¹ mod q
+	}
+	return ringElement(f)
+}
+
+// sampleNTT draws a uniformly random nttElement from a stream of uniformly
+// random bytes generated by the XOF function, according to FIPS 203,
+// Algorithm 7.
+func sampleNTT(rho []byte, ii, jj byte) nttElement {
+	B := sha3.NewShake128()
+	B.Write(rho)
+	B.Write([]byte{ii, jj})
+
+	// SampleNTT essentially draws 12 bits at a time from r, interprets them in
+	// little-endian, and rejects values greater than or equal to q, until it
+	// has drawn 256 values. (The rejection rate is approximately 19%.)
+	//
+	// To do this from a bytes stream, it draws three bytes at a time, and
+	// splits them into two uint16 appropriately masked.
+	//
+	//               r₀              r₁              r₂
+	//       |- - - - - - - -|- - - - - - - -|- - - - - - - -|
+	//
+	//               Uint16(r₀ || r₁)
+	//       |- - - - - - - - - - - - - - - -|
+	//       |- - - - - - - - - - - -|
+	//                   d₁
+	//
+	//                                Uint16(r₁ || r₂)
+	//                       |- - - - - - - - - - - - - - - -|
+	//                               |- - - - - - - - - - - -|
+	//                                           d₂
+	//
+	// Note that in little-endian, the rightmost bits are the most significant
+	// bits (dropped with a mask) and the leftmost bits are the least
+	// significant bits (dropped with a right shift).
+
+	var a nttElement
+	var j int        // index into a
+	var buf [24]byte // buffered reads from B
+	off := len(buf)  // index into buf, starts in a "buffer fully consumed" state
+	for {
+		if off >= len(buf) {
+			B.Read(buf[:])
+			off = 0
+		}
+		d1 := byteorder.LEUint16(buf[off:]) & 0b1111_1111_1111
+		d2 := byteorder.LEUint16(buf[off+1:]) >> 4
+		off += 3
+		if d1 < q {
+			a[j] = fieldElement(d1)
+			j++
+		}
+		if j >= len(a) {
+			break
+		}
+		if d2 < q {
+			a[j] = fieldElement(d2)
+			j++
+		}
+		if j >= len(a) {
+			break
+		}
+	}
+	return a
+}
\ No newline at end of file
diff --git a/mlkem/mlkem768.go b/mlkem/mlkem768.go
new file mode 100644
index 0000000..2b4fc12
--- /dev/null
+++ b/mlkem/mlkem768.go
@@ -0,0 +1,517 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package mlkem implements the quantum-resistant key-encapsulation mechanism
+// ML-KEM (formerly known as Kyber), as specified in [NIST FIPS 203].
+//
+// [NIST FIPS 203]: https://doi.org/10.6028/NIST.FIPS.203
+package mlkem
+
+// This package targets security, correctness, simplicity, readability, and
+// reviewability as its primary goals. All critical operations are performed in
+// constant time.
+//
+// Variable and function names, as well as code layout, are selected to
+// facilitate reviewing the implementation against the NIST FIPS 203 document.
+//
+// Reviewers unfamiliar with polynomials or linear algebra might find the
+// background at https://words.filippo.io/kyber-math/ useful.
+//
+// This file implements the recommended parameter set ML-KEM-768. The ML-KEM-1024
+// parameter set implementation is auto-generated from this file.
+//
+//go:generate go run generate1024.go -input mlkem768.go -output mlkem1024.go
+
+import (
+	"bytes"
+	//"github.com/xtls/reality/fips140"
+	"github.com/xtls/reality/drbg"
+	"github.com/xtls/reality/sha3"
+	"github.com/xtls/reality/subtle"
+	"errors"
+)
+
+const (
+	// ML-KEM global constants.
+	n = 256
+	q = 3329
+
+	// encodingSizeX is the byte size of a ringElement or nttElement encoded
+	// by ByteEncode_X (FIPS 203, Algorithm 5).
+	encodingSize12 = n * 12 / 8
+	encodingSize11 = n * 11 / 8
+	encodingSize10 = n * 10 / 8
+	encodingSize5  = n * 5 / 8
+	encodingSize4  = n * 4 / 8
+	encodingSize1  = n * 1 / 8
+
+	messageSize = encodingSize1
+
+	SharedKeySize = 32
+	SeedSize      = 32 + 32
+)
+
+// ML-KEM-768 parameters.
+const (
+	k = 3
+
+	CiphertextSize768       = k*encodingSize10 + encodingSize4
+	EncapsulationKeySize768 = k*encodingSize12 + 32
+	decapsulationKeySize768 = k*encodingSize12 + EncapsulationKeySize768 + 32 + 32
+)
+
+// ML-KEM-1024 parameters.
+const (
+	k1024 = 4
+
+	CiphertextSize1024       = k1024*encodingSize11 + encodingSize5
+	EncapsulationKeySize1024 = k1024*encodingSize12 + 32
+	decapsulationKeySize1024 = k1024*encodingSize12 + EncapsulationKeySize1024 + 32 + 32
+)
+
+// A DecapsulationKey768 is the secret key used to decapsulate a shared key from a
+// ciphertext. It includes various precomputed values.
+type DecapsulationKey768 struct {
+	d [32]byte // decapsulation key seed
+	z [32]byte // implicit rejection sampling seed
+
+	ρ [32]byte // sampleNTT seed for A, stored for the encapsulation key
+	h [32]byte // H(ek), stored for ML-KEM.Decaps_internal
+
+	encryptionKey
+	decryptionKey
+}
+
+// Bytes returns the decapsulation key as a 64-byte seed in the "d || z" form.
+//
+// The decapsulation key must be kept secret.
+func (dk *DecapsulationKey768) Bytes() []byte {
+	var b [SeedSize]byte
+	copy(b[:], dk.d[:])
+	copy(b[32:], dk.z[:])
+	return b[:]
+}
+
+// TestingOnlyExpandedBytes768 returns the decapsulation key as a byte slice
+// using the full expanded NIST encoding.
+//
+// This should only be used for ACVP testing. For all other purposes prefer
+// the Bytes method that returns the (much smaller) seed.
+func TestingOnlyExpandedBytes768(dk *DecapsulationKey768) []byte {
+	b := make([]byte, 0, decapsulationKeySize768)
+
+	// ByteEncode₁₂(s)
+	for i := range dk.s {
+		b = polyByteEncode(b, dk.s[i])
+	}
+
+	// ByteEncode₁₂(t) || ρ
+	for i := range dk.t {
+		b = polyByteEncode(b, dk.t[i])
+	}
+	b = append(b, dk.ρ[:]...)
+
+	// H(ek) || z
+	b = append(b, dk.h[:]...)
+	b = append(b, dk.z[:]...)
+
+	return b
+}
+
+// EncapsulationKey returns the public encapsulation key necessary to produce
+// ciphertexts.
+func (dk *DecapsulationKey768) EncapsulationKey() *EncapsulationKey768 {
+	return &EncapsulationKey768{
+		ρ:             dk.ρ,
+		h:             dk.h,
+		encryptionKey: dk.encryptionKey,
+	}
+}
+
+// An EncapsulationKey768 is the public key used to produce ciphertexts to be
+// decapsulated by the corresponding [DecapsulationKey768].
+type EncapsulationKey768 struct {
+	ρ [32]byte // sampleNTT seed for A
+	h [32]byte // H(ek)
+	encryptionKey
+}
+
+// Bytes returns the encapsulation key as a byte slice.
+func (ek *EncapsulationKey768) Bytes() []byte {
+	// The actual logic is in a separate function to outline this allocation.
+	b := make([]byte, 0, EncapsulationKeySize768)
+	return ek.bytes(b)
+}
+
+func (ek *EncapsulationKey768) bytes(b []byte) []byte {
+	for i := range ek.t {
+		b = polyByteEncode(b, ek.t[i])
+	}
+	b = append(b, ek.ρ[:]...)
+	return b
+}
+
+// encryptionKey is the parsed and expanded form of a PKE encryption key.
+type encryptionKey struct {
+	t [k]nttElement     // ByteDecode₁₂(ek[:384k])
+	a [k * k]nttElement // A[i*k+j] = sampleNTT(ρ, j, i)
+}
+
+// decryptionKey is the parsed and expanded form of a PKE decryption key.
+type decryptionKey struct {
+	s [k]nttElement // ByteDecode₁₂(dk[:384k])
+}
+
+// GenerateKey768 generates a new decapsulation key, drawing random bytes from
+// a DRBG. The decapsulation key must be kept secret.
+func GenerateKey768() (*DecapsulationKey768, error) {
+	// The actual logic is in a separate function to outline this allocation.
+	dk := &DecapsulationKey768{}
+	return generateKey(dk)
+}
+
+func generateKey(dk *DecapsulationKey768) (*DecapsulationKey768, error) {
+	var d [32]byte
+	drbg.Read(d[:])
+	var z [32]byte
+	drbg.Read(z[:])
+	kemKeyGen(dk, &d, &z)
+	// if err := fips140.PCT("ML-KEM PCT", func() error { return kemPCT(dk) }); err != nil {
+	// 	// This clearly can't happen, but FIPS 140-3 requires us to check.
+	// 	panic(err)
+	// }
+	//fips140.RecordApproved()
+	return dk, nil
+}
+
+// GenerateKeyInternal768 is a derandomized version of GenerateKey768,
+// exclusively for use in tests.
+func GenerateKeyInternal768(d, z *[32]byte) *DecapsulationKey768 {
+	dk := &DecapsulationKey768{}
+	kemKeyGen(dk, d, z)
+	return dk
+}
+
+// NewDecapsulationKey768 parses a decapsulation key from a 64-byte
+// seed in the "d || z" form. The seed must be uniformly random.
+func NewDecapsulationKey768(seed []byte) (*DecapsulationKey768, error) {
+	// The actual logic is in a separate function to outline this allocation.
+	dk := &DecapsulationKey768{}
+	return newKeyFromSeed(dk, seed)
+}
+
+func newKeyFromSeed(dk *DecapsulationKey768, seed []byte) (*DecapsulationKey768, error) {
+	if len(seed) != SeedSize {
+		return nil, errors.New("mlkem: invalid seed length")
+	}
+	d := (*[32]byte)(seed[:32])
+	z := (*[32]byte)(seed[32:])
+	kemKeyGen(dk, d, z)
+	// if err := fips140.PCT("ML-KEM PCT", func() error { return kemPCT(dk) }); err != nil {
+	// 	// This clearly can't happen, but FIPS 140-3 requires us to check.
+	// 	panic(err)
+	// }
+	//fips140.RecordApproved()
+	return dk, nil
+}
+
+// TestingOnlyNewDecapsulationKey768 parses a decapsulation key from its expanded NIST format.
+//
+// Bytes() must not be called on the returned key, as it will not produce the
+// original seed.
+//
+// This function should only be used for ACVP testing. Prefer NewDecapsulationKey768 for all
+// other purposes.
+func TestingOnlyNewDecapsulationKey768(b []byte) (*DecapsulationKey768, error) {
+	if len(b) != decapsulationKeySize768 {
+		return nil, errors.New("mlkem: invalid NIST decapsulation key length")
+	}
+
+	dk := &DecapsulationKey768{}
+	for i := range dk.s {
+		var err error
+		dk.s[i], err = polyByteDecode[nttElement](b[:encodingSize12])
+		if err != nil {
+			return nil, errors.New("mlkem: invalid secret key encoding")
+		}
+		b = b[encodingSize12:]
+	}
+
+	ek, err := NewEncapsulationKey768(b[:EncapsulationKeySize768])
+	if err != nil {
+		return nil, err
+	}
+	dk.ρ = ek.ρ
+	dk.h = ek.h
+	dk.encryptionKey = ek.encryptionKey
+	b = b[EncapsulationKeySize768:]
+
+	if !bytes.Equal(dk.h[:], b[:32]) {
+		return nil, errors.New("mlkem: inconsistent H(ek) in encoded bytes")
+	}
+	b = b[32:]
+
+	copy(dk.z[:], b)
+
+	// Generate a random d value for use in Bytes(). This is a safety mechanism:
+	// should Bytes() be called in contravention of the
+	// TestingOnlyNewDecapsulationKey768 function comment advising against it,
+	// it returns a random key rather than a broken key.
+	drbg.Read(dk.d[:])
+
+	return dk, nil
+}
+
+// kemKeyGen generates a decapsulation key.
+//
+// It implements ML-KEM.KeyGen_internal according to FIPS 203, Algorithm 16, and
+// K-PKE.KeyGen according to FIPS 203, Algorithm 13. The two are merged to save
+// copies and allocations.
+func kemKeyGen(dk *DecapsulationKey768, d, z *[32]byte) {
+	dk.d = *d
+	dk.z = *z
+
+	g := sha3.New512()
+	g.Write(d[:])
+	g.Write([]byte{k}) // Module dimension as a domain separator.
+	G := g.Sum(make([]byte, 0, 64))
+	ρ, σ := G[:32], G[32:]
+	dk.ρ = [32]byte(ρ)
+
+	A := &dk.a
+	for i := byte(0); i < k; i++ {
+		for j := byte(0); j < k; j++ {
+			A[i*k+j] = sampleNTT(ρ, j, i)
+		}
+	}
+
+	var N byte
+	s := &dk.s
+	for i := range s {
+		s[i] = ntt(samplePolyCBD(σ, N))
+		N++
+	}
+	e := make([]nttElement, k)
+	for i := range e {
+		e[i] = ntt(samplePolyCBD(σ, N))
+		N++
+	}
+
+	t := &dk.t
+	for i := range t { // t = A ◦ s + e
+		t[i] = e[i]
+		for j := range s {
+			t[i] = polyAdd(t[i], nttMul(A[i*k+j], s[j]))
+		}
+	}
+
+	H := sha3.New256()
+	ek := dk.EncapsulationKey().Bytes()
+	H.Write(ek)
+	H.Sum(dk.h[:0])
+}
+
+// kemPCT performs a Pairwise Consistency Test per FIPS 140-3 IG 10.3.A
+// Additional Comment 1: "For key pairs generated for use with approved KEMs in
+// FIPS 203, the PCT shall consist of applying the encapsulation key ek to
+// encapsulate a shared secret K leading to ciphertext c, and then applying
+// decapsulation key dk to retrieve the same shared secret K. The PCT passes if
+// the two shared secret K values are equal. The PCT shall be performed either
+// when keys are generated/imported, prior to the first exportation, or prior to
+// the first operational use (if not exported before the first use)."
+func kemPCT(dk *DecapsulationKey768) error {
+	ek := dk.EncapsulationKey()
+	K, c := ek.Encapsulate()
+	K1, err := dk.Decapsulate(c)
+	if err != nil {
+		return err
+	}
+	if subtle.ConstantTimeCompare(K, K1) != 1 {
+		return errors.New("mlkem: PCT failed")
+	}
+	return nil
+}
+
+// Encapsulate generates a shared key and an associated ciphertext from an
+// encapsulation key, drawing random bytes from a DRBG.
+//
+// The shared key must be kept secret.
+func (ek *EncapsulationKey768) Encapsulate() (sharedKey, ciphertext []byte) {
+	// The actual logic is in a separate function to outline this allocation.
+	var cc [CiphertextSize768]byte
+	return ek.encapsulate(&cc)
+}
+
+func (ek *EncapsulationKey768) encapsulate(cc *[CiphertextSize768]byte) (sharedKey, ciphertext []byte) {
+	var m [messageSize]byte
+	drbg.Read(m[:])
+	// Note that the modulus check (step 2 of the encapsulation key check from
+	// FIPS 203, Section 7.2) is performed by polyByteDecode in parseEK.
+	//fips140.RecordApproved()
+	return kemEncaps(cc, ek, &m)
+}
+
+// EncapsulateInternal is a derandomized version of Encapsulate, exclusively for
+// use in tests.
+func (ek *EncapsulationKey768) EncapsulateInternal(m *[32]byte) (sharedKey, ciphertext []byte) {
+	cc := &[CiphertextSize768]byte{}
+	return kemEncaps(cc, ek, m)
+}
+
+// kemEncaps generates a shared key and an associated ciphertext.
+//
+// It implements ML-KEM.Encaps_internal according to FIPS 203, Algorithm 17.
+func kemEncaps(cc *[CiphertextSize768]byte, ek *EncapsulationKey768, m *[messageSize]byte) (K, c []byte) {
+	g := sha3.New512()
+	g.Write(m[:])
+	g.Write(ek.h[:])
+	G := g.Sum(nil)
+	K, r := G[:SharedKeySize], G[SharedKeySize:]
+	c = pkeEncrypt(cc, &ek.encryptionKey, m, r)
+	return K, c
+}
+
+// NewEncapsulationKey768 parses an encapsulation key from its encoded form.
+// If the encapsulation key is not valid, NewEncapsulationKey768 returns an error.
+func NewEncapsulationKey768(encapsulationKey []byte) (*EncapsulationKey768, error) {
+	// The actual logic is in a separate function to outline this allocation.
+	ek := &EncapsulationKey768{}
+	return parseEK(ek, encapsulationKey)
+}
+
+// parseEK parses an encryption key from its encoded form.
+//
+// It implements the initial stages of K-PKE.Encrypt according to FIPS 203,
+// Algorithm 14.
+func parseEK(ek *EncapsulationKey768, ekPKE []byte) (*EncapsulationKey768, error) {
+	if len(ekPKE) != EncapsulationKeySize768 {
+		return nil, errors.New("mlkem: invalid encapsulation key length")
+	}
+
+	h := sha3.New256()
+	h.Write(ekPKE)
+	h.Sum(ek.h[:0])
+
+	for i := range ek.t {
+		var err error
+		ek.t[i], err = polyByteDecode[nttElement](ekPKE[:encodingSize12])
+		if err != nil {
+			return nil, err
+		}
+		ekPKE = ekPKE[encodingSize12:]
+	}
+	copy(ek.ρ[:], ekPKE)
+
+	for i := byte(0); i < k; i++ {
+		for j := byte(0); j < k; j++ {
+			ek.a[i*k+j] = sampleNTT(ek.ρ[:], j, i)
+		}
+	}
+
+	return ek, nil
+}
+
+// pkeEncrypt encrypts a plaintext message.
+//
+// It implements K-PKE.Encrypt according to FIPS 203, Algorithm 14, although the
+// computation of t and AT is done in parseEK.
+func pkeEncrypt(cc *[CiphertextSize768]byte, ex *encryptionKey, m *[messageSize]byte, rnd []byte) []byte {
+	var N byte
+	r, e1 := make([]nttElement, k), make([]ringElement, k)
+	for i := range r {
+		r[i] = ntt(samplePolyCBD(rnd, N))
+		N++
+	}
+	for i := range e1 {
+		e1[i] = samplePolyCBD(rnd, N)
+		N++
+	}
+	e2 := samplePolyCBD(rnd, N)
+
+	u := make([]ringElement, k) // NTT⁻¹(AT ◦ r) + e1
+	for i := range u {
+		u[i] = e1[i]
+		for j := range r {
+			// Note that i and j are inverted, as we need the transpose of A.
+			u[i] = polyAdd(u[i], inverseNTT(nttMul(ex.a[j*k+i], r[j])))
+		}
+	}
+
+	μ := ringDecodeAndDecompress1(m)
+
+	var vNTT nttElement // t⊺ ◦ r
+	for i := range ex.t {
+		vNTT = polyAdd(vNTT, nttMul(ex.t[i], r[i]))
+	}
+	v := polyAdd(polyAdd(inverseNTT(vNTT), e2), μ)
+
+	c := cc[:0]
+	for _, f := range u {
+		c = ringCompressAndEncode10(c, f)
+	}
+	c = ringCompressAndEncode4(c, v)
+
+	return c
+}
+
+// Decapsulate generates a shared key from a ciphertext and a decapsulation key.
+// If the ciphertext is not valid, Decapsulate returns an error.
+//
+// The shared key must be kept secret.
+func (dk *DecapsulationKey768) Decapsulate(ciphertext []byte) (sharedKey []byte, err error) {
+	if len(ciphertext) != CiphertextSize768 {
+		return nil, errors.New("mlkem: invalid ciphertext length")
+	}
+	c := (*[CiphertextSize768]byte)(ciphertext)
+	// Note that the hash check (step 3 of the decapsulation input check from
+	// FIPS 203, Section 7.3) is foregone as a DecapsulationKey is always
+	// validly generated by ML-KEM.KeyGen_internal.
+	return kemDecaps(dk, c), nil
+}
+
+// kemDecaps produces a shared key from a ciphertext.
+//
+// It implements ML-KEM.Decaps_internal according to FIPS 203, Algorithm 18.
+func kemDecaps(dk *DecapsulationKey768, c *[CiphertextSize768]byte) (K []byte) {
+	//fips140.RecordApproved()
+	m := pkeDecrypt(&dk.decryptionKey, c)
+	g := sha3.New512()
+	g.Write(m[:])
+	g.Write(dk.h[:])
+	G := g.Sum(make([]byte, 0, 64))
+	Kprime, r := G[:SharedKeySize], G[SharedKeySize:]
+	J := sha3.NewShake256()
+	J.Write(dk.z[:])
+	J.Write(c[:])
+	Kout := make([]byte, SharedKeySize)
+	J.Read(Kout)
+	var cc [CiphertextSize768]byte
+	c1 := pkeEncrypt(&cc, &dk.encryptionKey, (*[32]byte)(m), r)
+
+	subtle.ConstantTimeCopy(subtle.ConstantTimeCompare(c[:], c1), Kout, Kprime)
+	return Kout
+}
+
+// pkeDecrypt decrypts a ciphertext.
+//
+// It implements K-PKE.Decrypt according to FIPS 203, Algorithm 15,
+// although s is retained from kemKeyGen.
+func pkeDecrypt(dx *decryptionKey, c *[CiphertextSize768]byte) []byte {
+	u := make([]ringElement, k)
+	for i := range u {
+		b := (*[encodingSize10]byte)(c[encodingSize10*i : encodingSize10*(i+1)])
+		u[i] = ringDecodeAndDecompress10(b)
+	}
+
+	b := (*[encodingSize4]byte)(c[encodingSize10*k:])
+	v := ringDecodeAndDecompress4(b)
+
+	var mask nttElement // s⊺ ◦ NTT(u)
+	for i := range dx.s {
+		mask = polyAdd(mask, nttMul(dx.s[i], ntt(u[i])))
+	}
+	w := polySub(v, inverseNTT(mask))
+
+	return ringCompressAndEncode1(nil, w)
+}
\ No newline at end of file
diff --git a/mlkem768/mlkem768.go b/mlkem768/mlkem768.go
deleted file mode 100644
index 24bedea..0000000
--- a/mlkem768/mlkem768.go
+++ /dev/null
@@ -1,886 +0,0 @@
-// Copyright 2023 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package mlkem768 implements the quantum-resistant key encapsulation method
-// ML-KEM (formerly known as Kyber).
-//
-// Only the recommended ML-KEM-768 parameter set is provided.
-//
-// The version currently implemented is the one specified by [NIST FIPS 203 ipd],
-// with the unintentional transposition of the matrix A reverted to match the
-// behavior of [Kyber version 3.0]. Future versions of this package might
-// introduce backwards incompatible changes to implement changes to FIPS 203.
-//
-// [Kyber version 3.0]: https://pq-crystals.org/kyber/data/kyber-specification-round3-20210804.pdf
-// [NIST FIPS 203 ipd]: https://doi.org/10.6028/NIST.FIPS.203.ipd
-package mlkem768
-
-// This package targets security, correctness, simplicity, readability, and
-// reviewability as its primary goals. All critical operations are performed in
-// constant time.
-//
-// Variable and function names, as well as code layout, are selected to
-// facilitate reviewing the implementation against the NIST FIPS 203 ipd
-// document.
-//
-// Reviewers unfamiliar with polynomials or linear algebra might find the
-// background at https://words.filippo.io/kyber-math/ useful.
-
-import (
-	"crypto/rand"
-	"crypto/subtle"
-	"encoding/binary"
-	"errors"
-
-	"golang.org/x/crypto/sha3"
-)
-
-const (
-	// ML-KEM global constants.
-	n = 256
-	q = 3329
-
-	log2q = 12
-
-	// ML-KEM-768 parameters. The code makes assumptions based on these values,
-	// they can't be changed blindly.
-	k  = 3
-	η  = 2
-	du = 10
-	dv = 4
-
-	// encodingSizeX is the byte size of a ringElement or nttElement encoded
-	// by ByteEncode_X (FIPS 203 (DRAFT), Algorithm 4).
-	encodingSize12 = n * log2q / 8
-	encodingSize10 = n * du / 8
-	encodingSize4  = n * dv / 8
-	encodingSize1  = n * 1 / 8
-
-	messageSize       = encodingSize1
-	decryptionKeySize = k * encodingSize12
-	encryptionKeySize = k*encodingSize12 + 32
-
-	CiphertextSize       = k*encodingSize10 + encodingSize4
-	EncapsulationKeySize = encryptionKeySize
-	DecapsulationKeySize = decryptionKeySize + encryptionKeySize + 32 + 32
-	SharedKeySize        = 32
-	SeedSize             = 32 + 32
-)
-
-// A DecapsulationKey is the secret key used to decapsulate a shared key from a
-// ciphertext. It includes various precomputed values.
-type DecapsulationKey struct {
-	dk [DecapsulationKeySize]byte
-	encryptionKey
-	decryptionKey
-}
-
-// Bytes returns the extended encoding of the decapsulation key, according to
-// FIPS 203 (DRAFT).
-func (dk *DecapsulationKey) Bytes() []byte {
-	var b [DecapsulationKeySize]byte
-	copy(b[:], dk.dk[:])
-	return b[:]
-}
-
-// EncapsulationKey returns the public encapsulation key necessary to produce
-// ciphertexts.
-func (dk *DecapsulationKey) EncapsulationKey() []byte {
-	var b [EncapsulationKeySize]byte
-	copy(b[:], dk.dk[decryptionKeySize:])
-	return b[:]
-}
-
-// encryptionKey is the parsed and expanded form of a PKE encryption key.
-type encryptionKey struct {
-	t [k]nttElement     // ByteDecode₁₂(ek[:384k])
-	A [k * k]nttElement // A[i*k+j] = sampleNTT(ρ, j, i)
-}
-
-// decryptionKey is the parsed and expanded form of a PKE decryption key.
-type decryptionKey struct {
-	s [k]nttElement // ByteDecode₁₂(dk[:decryptionKeySize])
-}
-
-// GenerateKey generates a new decapsulation key, drawing random bytes from
-// crypto/rand. The decapsulation key must be kept secret.
-func GenerateKey() (*DecapsulationKey, error) {
-	// The actual logic is in a separate function to outline this allocation.
-	dk := &DecapsulationKey{}
-	return generateKey(dk)
-}
-
-func generateKey(dk *DecapsulationKey) (*DecapsulationKey, error) {
-	var d [32]byte
-	if _, err := rand.Read(d[:]); err != nil {
-		return nil, errors.New("mlkem768: crypto/rand Read failed: " + err.Error())
-	}
-	var z [32]byte
-	if _, err := rand.Read(z[:]); err != nil {
-		return nil, errors.New("mlkem768: crypto/rand Read failed: " + err.Error())
-	}
-	return kemKeyGen(dk, &d, &z), nil
-}
-
-// NewKeyFromSeed deterministically generates a decapsulation key from a 64-byte
-// seed in the "d || z" form. The seed must be uniformly random.
-func NewKeyFromSeed(seed []byte) (*DecapsulationKey, error) {
-	// The actual logic is in a separate function to outline this allocation.
-	dk := &DecapsulationKey{}
-	return newKeyFromSeed(dk, seed)
-}
-
-func newKeyFromSeed(dk *DecapsulationKey, seed []byte) (*DecapsulationKey, error) {
-	if len(seed) != SeedSize {
-		return nil, errors.New("mlkem768: invalid seed length")
-	}
-	d := (*[32]byte)(seed[:32])
-	z := (*[32]byte)(seed[32:])
-	return kemKeyGen(dk, d, z), nil
-}
-
-// NewKeyFromExtendedEncoding parses a decapsulation key from its FIPS 203
-// (DRAFT) extended encoding.
-func NewKeyFromExtendedEncoding(decapsulationKey []byte) (*DecapsulationKey, error) {
-	// The actual logic is in a separate function to outline this allocation.
-	dk := &DecapsulationKey{}
-	return newKeyFromExtendedEncoding(dk, decapsulationKey)
-}
-
-func newKeyFromExtendedEncoding(dk *DecapsulationKey, dkBytes []byte) (*DecapsulationKey, error) {
-	if len(dkBytes) != DecapsulationKeySize {
-		return nil, errors.New("mlkem768: invalid decapsulation key length")
-	}
-
-	// Note that we don't check that H(ek) matches ekPKE, as that's not
-	// specified in FIPS 203 (DRAFT). This is one reason to prefer the seed
-	// private key format.
-	dk.dk = [DecapsulationKeySize]byte(dkBytes)
-
-	dkPKE := dkBytes[:decryptionKeySize]
-	if err := parseDK(&dk.decryptionKey, dkPKE); err != nil {
-		return nil, err
-	}
-
-	ekPKE := dkBytes[decryptionKeySize : decryptionKeySize+encryptionKeySize]
-	if err := parseEK(&dk.encryptionKey, ekPKE); err != nil {
-		return nil, err
-	}
-
-	return dk, nil
-}
-
-// kemKeyGen generates a decapsulation key.
-//
-// It implements ML-KEM.KeyGen according to FIPS 203 (DRAFT), Algorithm 15, and
-// K-PKE.KeyGen according to FIPS 203 (DRAFT), Algorithm 12. The two are merged
-// to save copies and allocations.
-func kemKeyGen(dk *DecapsulationKey, d, z *[32]byte) *DecapsulationKey {
-	if dk == nil {
-		dk = &DecapsulationKey{}
-	}
-
-	G := sha3.Sum512(d[:])
-	ρ, σ := G[:32], G[32:]
-
-	A := &dk.A
-	for i := byte(0); i < k; i++ {
-		for j := byte(0); j < k; j++ {
-			// Note that this is consistent with Kyber round 3, rather than with
-			// the initial draft of FIPS 203, because NIST signaled that the
-			// change was involuntary and will be reverted.
-			A[i*k+j] = sampleNTT(ρ, j, i)
-		}
-	}
-
-	var N byte
-	s := &dk.s
-	for i := range s {
-		s[i] = ntt(samplePolyCBD(σ, N))
-		N++
-	}
-	e := make([]nttElement, k)
-	for i := range e {
-		e[i] = ntt(samplePolyCBD(σ, N))
-		N++
-	}
-
-	t := &dk.t
-	for i := range t { // t = A ◦ s + e
-		t[i] = e[i]
-		for j := range s {
-			t[i] = polyAdd(t[i], nttMul(A[i*k+j], s[j]))
-		}
-	}
-
-	// dkPKE ← ByteEncode₁₂(s)
-	// ekPKE ← ByteEncode₁₂(t) || ρ
-	// ek ← ekPKE
-	// dk ← dkPKE || ek || H(ek) || z
-	dkB := dk.dk[:0]
-
-	for i := range s {
-		dkB = polyByteEncode(dkB, s[i])
-	}
-
-	for i := range t {
-		dkB = polyByteEncode(dkB, t[i])
-	}
-	dkB = append(dkB, ρ...)
-
-	H := sha3.New256()
-	H.Write(dkB[decryptionKeySize:])
-	dkB = H.Sum(dkB)
-
-	dkB = append(dkB, z[:]...)
-
-	if len(dkB) != len(dk.dk) {
-		panic("mlkem768: internal error: invalid decapsulation key size")
-	}
-
-	return dk
-}
-
-// Encapsulate generates a shared key and an associated ciphertext from an
-// encapsulation key, drawing random bytes from crypto/rand.
-// If the encapsulation key is not valid, Encapsulate returns an error.
-//
-// The shared key must be kept secret.
-func Encapsulate(encapsulationKey []byte) (ciphertext, sharedKey []byte, err error) {
-	// The actual logic is in a separate function to outline this allocation.
-	var cc [CiphertextSize]byte
-	return encapsulate(&cc, encapsulationKey)
-}
-
-func encapsulate(cc *[CiphertextSize]byte, encapsulationKey []byte) (ciphertext, sharedKey []byte, err error) {
-	if len(encapsulationKey) != EncapsulationKeySize {
-		return nil, nil, errors.New("mlkem768: invalid encapsulation key length")
-	}
-	var m [messageSize]byte
-	if _, err := rand.Read(m[:]); err != nil {
-		return nil, nil, errors.New("mlkem768: crypto/rand Read failed: " + err.Error())
-	}
-	return kemEncaps(cc, encapsulationKey, &m)
-}
-
-// kemEncaps generates a shared key and an associated ciphertext.
-//
-// It implements ML-KEM.Encaps according to FIPS 203 (DRAFT), Algorithm 16.
-func kemEncaps(cc *[CiphertextSize]byte, ek []byte, m *[messageSize]byte) (c, K []byte, err error) {
-	if cc == nil {
-		cc = &[CiphertextSize]byte{}
-	}
-
-	H := sha3.Sum256(ek[:])
-	g := sha3.New512()
-	g.Write(m[:])
-	g.Write(H[:])
-	G := g.Sum(nil)
-	K, r := G[:SharedKeySize], G[SharedKeySize:]
-	var ex encryptionKey
-	if err := parseEK(&ex, ek[:]); err != nil {
-		return nil, nil, err
-	}
-	c = pkeEncrypt(cc, &ex, m, r)
-	return c, K, nil
-}
-
-// parseEK parses an encryption key from its encoded form.
-//
-// It implements the initial stages of K-PKE.Encrypt according to FIPS 203
-// (DRAFT), Algorithm 13.
-func parseEK(ex *encryptionKey, ekPKE []byte) error {
-	if len(ekPKE) != encryptionKeySize {
-		return errors.New("mlkem768: invalid encryption key length")
-	}
-
-	for i := range ex.t {
-		var err error
-		ex.t[i], err = polyByteDecode[nttElement](ekPKE[:encodingSize12])
-		if err != nil {
-			return err
-		}
-		ekPKE = ekPKE[encodingSize12:]
-	}
-	ρ := ekPKE
-
-	for i := byte(0); i < k; i++ {
-		for j := byte(0); j < k; j++ {
-			// See the note in pkeKeyGen about the order of the indices being
-			// consistent with Kyber round 3.
-			ex.A[i*k+j] = sampleNTT(ρ, j, i)
-		}
-	}
-
-	return nil
-}
-
-// pkeEncrypt encrypt a plaintext message.
-//
-// It implements K-PKE.Encrypt according to FIPS 203 (DRAFT), Algorithm 13,
-// although the computation of t and AT is done in parseEK.
-func pkeEncrypt(cc *[CiphertextSize]byte, ex *encryptionKey, m *[messageSize]byte, rnd []byte) []byte {
-	var N byte
-	r, e1 := make([]nttElement, k), make([]ringElement, k)
-	for i := range r {
-		r[i] = ntt(samplePolyCBD(rnd, N))
-		N++
-	}
-	for i := range e1 {
-		e1[i] = samplePolyCBD(rnd, N)
-		N++
-	}
-	e2 := samplePolyCBD(rnd, N)
-
-	u := make([]ringElement, k) // NTT⁻¹(AT ◦ r) + e1
-	for i := range u {
-		u[i] = e1[i]
-		for j := range r {
-			// Note that i and j are inverted, as we need the transposed of A.
-			u[i] = polyAdd(u[i], inverseNTT(nttMul(ex.A[j*k+i], r[j])))
-		}
-	}
-
-	μ := ringDecodeAndDecompress1(m)
-
-	var vNTT nttElement // t⊺ ◦ r
-	for i := range ex.t {
-		vNTT = polyAdd(vNTT, nttMul(ex.t[i], r[i]))
-	}
-	v := polyAdd(polyAdd(inverseNTT(vNTT), e2), μ)
-
-	c := cc[:0]
-	for _, f := range u {
-		c = ringCompressAndEncode10(c, f)
-	}
-	c = ringCompressAndEncode4(c, v)
-
-	return c
-}
-
-// Decapsulate generates a shared key from a ciphertext and a decapsulation key.
-// If the ciphertext is not valid, Decapsulate returns an error.
-//
-// The shared key must be kept secret.
-func Decapsulate(dk *DecapsulationKey, ciphertext []byte) (sharedKey []byte, err error) {
-	if len(ciphertext) != CiphertextSize {
-		return nil, errors.New("mlkem768: invalid ciphertext length")
-	}
-	c := (*[CiphertextSize]byte)(ciphertext)
-	return kemDecaps(dk, c), nil
-}
-
-// kemDecaps produces a shared key from a ciphertext.
-//
-// It implements ML-KEM.Decaps according to FIPS 203 (DRAFT), Algorithm 17.
-func kemDecaps(dk *DecapsulationKey, c *[CiphertextSize]byte) (K []byte) {
-	h := dk.dk[decryptionKeySize+encryptionKeySize : decryptionKeySize+encryptionKeySize+32]
-	z := dk.dk[decryptionKeySize+encryptionKeySize+32:]
-
-	m := pkeDecrypt(&dk.decryptionKey, c)
-	g := sha3.New512()
-	g.Write(m[:])
-	g.Write(h)
-	G := g.Sum(nil)
-	Kprime, r := G[:SharedKeySize], G[SharedKeySize:]
-	J := sha3.NewShake256()
-	J.Write(z)
-	J.Write(c[:])
-	Kout := make([]byte, SharedKeySize)
-	J.Read(Kout)
-	var cc [CiphertextSize]byte
-	c1 := pkeEncrypt(&cc, &dk.encryptionKey, (*[32]byte)(m), r)
-
-	subtle.ConstantTimeCopy(subtle.ConstantTimeCompare(c[:], c1), Kout, Kprime)
-	return Kout
-}
-
-// parseDK parses a decryption key from its encoded form.
-//
-// It implements the computation of s from K-PKE.Decrypt according to FIPS 203
-// (DRAFT), Algorithm 14.
-func parseDK(dx *decryptionKey, dkPKE []byte) error {
-	if len(dkPKE) != decryptionKeySize {
-		return errors.New("mlkem768: invalid decryption key length")
-	}
-
-	for i := range dx.s {
-		f, err := polyByteDecode[nttElement](dkPKE[:encodingSize12])
-		if err != nil {
-			return err
-		}
-		dx.s[i] = f
-		dkPKE = dkPKE[encodingSize12:]
-	}
-
-	return nil
-}
-
-// pkeDecrypt decrypts a ciphertext.
-//
-// It implements K-PKE.Decrypt according to FIPS 203 (DRAFT), Algorithm 14,
-// although the computation of s is done in parseDK.
-func pkeDecrypt(dx *decryptionKey, c *[CiphertextSize]byte) []byte {
-	u := make([]ringElement, k)
-	for i := range u {
-		b := (*[encodingSize10]byte)(c[encodingSize10*i : encodingSize10*(i+1)])
-		u[i] = ringDecodeAndDecompress10(b)
-	}
-
-	b := (*[encodingSize4]byte)(c[encodingSize10*k:])
-	v := ringDecodeAndDecompress4(b)
-
-	var mask nttElement // s⊺ ◦ NTT(u)
-	for i := range dx.s {
-		mask = polyAdd(mask, nttMul(dx.s[i], ntt(u[i])))
-	}
-	w := polySub(v, inverseNTT(mask))
-
-	return ringCompressAndEncode1(nil, w)
-}
-
-// fieldElement is an integer modulo q, an element of ℤ_q. It is always reduced.
-type fieldElement uint16
-
-// fieldCheckReduced checks that a value a is < q.
-func fieldCheckReduced(a uint16) (fieldElement, error) {
-	if a >= q {
-		return 0, errors.New("unreduced field element")
-	}
-	return fieldElement(a), nil
-}
-
-// fieldReduceOnce reduces a value a < 2q.
-func fieldReduceOnce(a uint16) fieldElement {
-	x := a - q
-	// If x underflowed, then x >= 2¹⁶ - q > 2¹⁵, so the top bit is set.
-	x += (x >> 15) * q
-	return fieldElement(x)
-}
-
-func fieldAdd(a, b fieldElement) fieldElement {
-	x := uint16(a + b)
-	return fieldReduceOnce(x)
-}
-
-func fieldSub(a, b fieldElement) fieldElement {
-	x := uint16(a - b + q)
-	return fieldReduceOnce(x)
-}
-
-const (
-	barrettMultiplier = 5039 // 2¹² * 2¹² / q
-	barrettShift      = 24   // log₂(2¹² * 2¹²)
-)
-
-// fieldReduce reduces a value a < 2q² using Barrett reduction, to avoid
-// potentially variable-time division.
-func fieldReduce(a uint32) fieldElement {
-	quotient := uint32((uint64(a) * barrettMultiplier) >> barrettShift)
-	return fieldReduceOnce(uint16(a - quotient*q))
-}
-
-func fieldMul(a, b fieldElement) fieldElement {
-	x := uint32(a) * uint32(b)
-	return fieldReduce(x)
-}
-
-// fieldMulSub returns a * (b - c). This operation is fused to save a
-// fieldReduceOnce after the subtraction.
-func fieldMulSub(a, b, c fieldElement) fieldElement {
-	x := uint32(a) * uint32(b-c+q)
-	return fieldReduce(x)
-}
-
-// fieldAddMul returns a * b + c * d. This operation is fused to save a
-// fieldReduceOnce and a fieldReduce.
-func fieldAddMul(a, b, c, d fieldElement) fieldElement {
-	x := uint32(a) * uint32(b)
-	x += uint32(c) * uint32(d)
-	return fieldReduce(x)
-}
-
-// compress maps a field element uniformly to the range 0 to 2ᵈ-1, according to
-// FIPS 203 (DRAFT), Definition 4.5.
-func compress(x fieldElement, d uint8) uint16 {
-	// We want to compute (x * 2ᵈ) / q, rounded to nearest integer, with 1/2
-	// rounding up (see FIPS 203 (DRAFT), Section 2.3).
-
-	// Barrett reduction produces a quotient and a remainder in the range [0, 2q),
-	// such that dividend = quotient * q + remainder.
-	dividend := uint32(x) << d // x * 2ᵈ
-	quotient := uint32(uint64(dividend) * barrettMultiplier >> barrettShift)
-	remainder := dividend - quotient*q
-
-	// Since the remainder is in the range [0, 2q), not [0, q), we need to
-	// portion it into three spans for rounding.
-	//
-	//     [ 0,       q/2     ) -> round to 0
-	//     [ q/2,     q + q/2 ) -> round to 1
-	//     [ q + q/2, 2q      ) -> round to 2
-	//
-	// We can convert that to the following logic: add 1 if remainder > q/2,
-	// then add 1 again if remainder > q + q/2.
-	//
-	// Note that if remainder > x, then ⌊x⌋ - remainder underflows, and the top
-	// bit of the difference will be set.
-	quotient += (q/2 - remainder) >> 31 & 1
-	quotient += (q + q/2 - remainder) >> 31 & 1
-
-	// quotient might have overflowed at this point, so reduce it by masking.
-	var mask uint32 = (1 << d) - 1
-	return uint16(quotient & mask)
-}
-
-// decompress maps a number x between 0 and 2ᵈ-1 uniformly to the full range of
-// field elements, according to FIPS 203 (DRAFT), Definition 4.6.
-func decompress(y uint16, d uint8) fieldElement {
-	// We want to compute (y * q) / 2ᵈ, rounded to nearest integer, with 1/2
-	// rounding up (see FIPS 203 (DRAFT), Section 2.3).
-
-	dividend := uint32(y) * q
-	quotient := dividend >> d // (y * q) / 2ᵈ
-
-	// The d'th least-significant bit of the dividend (the most significant bit
-	// of the remainder) is 1 for the top half of the values that divide to the
-	// same quotient, which are the ones that round up.
-	quotient += dividend >> (d - 1) & 1
-
-	// quotient is at most (2¹¹-1) * q / 2¹¹ + 1 = 3328, so it didn't overflow.
-	return fieldElement(quotient)
-}
-
-// ringElement is a polynomial, an element of R_q, represented as an array
-// according to FIPS 203 (DRAFT), Section 2.4.
-type ringElement [n]fieldElement
-
-// polyAdd adds two ringElements or nttElements.
-func polyAdd[T ~[n]fieldElement](a, b T) (s T) {
-	for i := range s {
-		s[i] = fieldAdd(a[i], b[i])
-	}
-	return s
-}
-
-// polySub subtracts two ringElements or nttElements.
-func polySub[T ~[n]fieldElement](a, b T) (s T) {
-	for i := range s {
-		s[i] = fieldSub(a[i], b[i])
-	}
-	return s
-}
-
-// polyByteEncode appends the 384-byte encoding of f to b.
-//
-// It implements ByteEncode₁₂, according to FIPS 203 (DRAFT), Algorithm 4.
-func polyByteEncode[T ~[n]fieldElement](b []byte, f T) []byte {
-	out, B := sliceForAppend(b, encodingSize12)
-	for i := 0; i < n; i += 2 {
-		x := uint32(f[i]) | uint32(f[i+1])<<12
-		B[0] = uint8(x)
-		B[1] = uint8(x >> 8)
-		B[2] = uint8(x >> 16)
-		B = B[3:]
-	}
-	return out
-}
-
-// polyByteDecode decodes the 384-byte encoding of a polynomial, checking that
-// all the coefficients are properly reduced. This achieves the "Modulus check"
-// step of ML-KEM Encapsulation Input Validation.
-//
-// polyByteDecode is also used in ML-KEM Decapsulation, where the input
-// validation is not required, but implicitly allowed by the specification.
-//
-// It implements ByteDecode₁₂, according to FIPS 203 (DRAFT), Algorithm 5.
-func polyByteDecode[T ~[n]fieldElement](b []byte) (T, error) {
-	if len(b) != encodingSize12 {
-		return T{}, errors.New("mlkem768: invalid encoding length")
-	}
-	var f T
-	for i := 0; i < n; i += 2 {
-		d := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16
-		const mask12 = 0b1111_1111_1111
-		var err error
-		if f[i], err = fieldCheckReduced(uint16(d & mask12)); err != nil {
-			return T{}, errors.New("mlkem768: invalid polynomial encoding")
-		}
-		if f[i+1], err = fieldCheckReduced(uint16(d >> 12)); err != nil {
-			return T{}, errors.New("mlkem768: invalid polynomial encoding")
-		}
-		b = b[3:]
-	}
-	return f, nil
-}
-
-// sliceForAppend takes a slice and a requested number of bytes. It returns a
-// slice with the contents of the given slice followed by that many bytes and a
-// second slice that aliases into it and contains only the extra bytes. If the
-// original slice has sufficient capacity then no allocation is performed.
-func sliceForAppend(in []byte, n int) (head, tail []byte) {
-	if total := len(in) + n; cap(in) >= total {
-		head = in[:total]
-	} else {
-		head = make([]byte, total)
-		copy(head, in)
-	}
-	tail = head[len(in):]
-	return
-}
-
-// ringCompressAndEncode1 appends a 32-byte encoding of a ring element to s,
-// compressing one coefficients per bit.
-//
-// It implements Compress₁, according to FIPS 203 (DRAFT), Definition 4.5,
-// followed by ByteEncode₁, according to FIPS 203 (DRAFT), Algorithm 4.
-func ringCompressAndEncode1(s []byte, f ringElement) []byte {
-	s, b := sliceForAppend(s, encodingSize1)
-	for i := range b {
-		b[i] = 0
-	}
-	for i := range f {
-		b[i/8] |= uint8(compress(f[i], 1) << (i % 8))
-	}
-	return s
-}
-
-// ringDecodeAndDecompress1 decodes a 32-byte slice to a ring element where each
-// bit is mapped to 0 or ⌈q/2⌋.
-//
-// It implements ByteDecode₁, according to FIPS 203 (DRAFT), Algorithm 5,
-// followed by Decompress₁, according to FIPS 203 (DRAFT), Definition 4.6.
-func ringDecodeAndDecompress1(b *[encodingSize1]byte) ringElement {
-	var f ringElement
-	for i := range f {
-		b_i := b[i/8] >> (i % 8) & 1
-		const halfQ = (q + 1) / 2        // ⌈q/2⌋, rounded up per FIPS 203 (DRAFT), Section 2.3
-		f[i] = fieldElement(b_i) * halfQ // 0 decompresses to 0, and 1 to ⌈q/2⌋
-	}
-	return f
-}
-
-// ringCompressAndEncode4 appends a 128-byte encoding of a ring element to s,
-// compressing two coefficients per byte.
-//
-// It implements Compress₄, according to FIPS 203 (DRAFT), Definition 4.5,
-// followed by ByteEncode₄, according to FIPS 203 (DRAFT), Algorithm 4.
-func ringCompressAndEncode4(s []byte, f ringElement) []byte {
-	s, b := sliceForAppend(s, encodingSize4)
-	for i := 0; i < n; i += 2 {
-		b[i/2] = uint8(compress(f[i], 4) | compress(f[i+1], 4)<<4)
-	}
-	return s
-}
-
-// ringDecodeAndDecompress4 decodes a 128-byte encoding of a ring element where
-// each four bits are mapped to an equidistant distribution.
-//
-// It implements ByteDecode₄, according to FIPS 203 (DRAFT), Algorithm 5,
-// followed by Decompress₄, according to FIPS 203 (DRAFT), Definition 4.6.
-func ringDecodeAndDecompress4(b *[encodingSize4]byte) ringElement {
-	var f ringElement
-	for i := 0; i < n; i += 2 {
-		f[i] = fieldElement(decompress(uint16(b[i/2]&0b1111), 4))
-		f[i+1] = fieldElement(decompress(uint16(b[i/2]>>4), 4))
-	}
-	return f
-}
-
-// ringCompressAndEncode10 appends a 320-byte encoding of a ring element to s,
-// compressing four coefficients per five bytes.
-//
-// It implements Compress₁₀, according to FIPS 203 (DRAFT), Definition 4.5,
-// followed by ByteEncode₁₀, according to FIPS 203 (DRAFT), Algorithm 4.
-func ringCompressAndEncode10(s []byte, f ringElement) []byte {
-	s, b := sliceForAppend(s, encodingSize10)
-	for i := 0; i < n; i += 4 {
-		var x uint64
-		x |= uint64(compress(f[i+0], 10))
-		x |= uint64(compress(f[i+1], 10)) << 10
-		x |= uint64(compress(f[i+2], 10)) << 20
-		x |= uint64(compress(f[i+3], 10)) << 30
-		b[0] = uint8(x)
-		b[1] = uint8(x >> 8)
-		b[2] = uint8(x >> 16)
-		b[3] = uint8(x >> 24)
-		b[4] = uint8(x >> 32)
-		b = b[5:]
-	}
-	return s
-}
-
-// ringDecodeAndDecompress10 decodes a 320-byte encoding of a ring element where
-// each ten bits are mapped to an equidistant distribution.
-//
-// It implements ByteDecode₁₀, according to FIPS 203 (DRAFT), Algorithm 5,
-// followed by Decompress₁₀, according to FIPS 203 (DRAFT), Definition 4.6.
-func ringDecodeAndDecompress10(bb *[encodingSize10]byte) ringElement {
-	b := bb[:]
-	var f ringElement
-	for i := 0; i < n; i += 4 {
-		x := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32
-		b = b[5:]
-		f[i] = fieldElement(decompress(uint16(x>>0&0b11_1111_1111), 10))
-		f[i+1] = fieldElement(decompress(uint16(x>>10&0b11_1111_1111), 10))
-		f[i+2] = fieldElement(decompress(uint16(x>>20&0b11_1111_1111), 10))
-		f[i+3] = fieldElement(decompress(uint16(x>>30&0b11_1111_1111), 10))
-	}
-	return f
-}
-
-// samplePolyCBD draws a ringElement from the special Dη distribution given a
-// stream of random bytes generated by the PRF function, according to FIPS 203
-// (DRAFT), Algorithm 7 and Definition 4.1.
-func samplePolyCBD(s []byte, b byte) ringElement {
-	prf := sha3.NewShake256()
-	prf.Write(s)
-	prf.Write([]byte{b})
-	B := make([]byte, 128)
-	prf.Read(B)
-
-	// SamplePolyCBD simply draws four (2η) bits for each coefficient, and adds
-	// the first two and subtracts the last two.
-
-	var f ringElement
-	for i := 0; i < n; i += 2 {
-		b := B[i/2]
-		b_7, b_6, b_5, b_4 := b>>7, b>>6&1, b>>5&1, b>>4&1
-		b_3, b_2, b_1, b_0 := b>>3&1, b>>2&1, b>>1&1, b&1
-		f[i] = fieldSub(fieldElement(b_0+b_1), fieldElement(b_2+b_3))
-		f[i+1] = fieldSub(fieldElement(b_4+b_5), fieldElement(b_6+b_7))
-	}
-	return f
-}
-
-// nttElement is an NTT representation, an element of T_q, represented as an
-// array according to FIPS 203 (DRAFT), Section 2.4.
-type nttElement [n]fieldElement
-
-// gammas are the values ζ^2BitRev7(i)+1 mod q for each index i.
-var gammas = [128]fieldElement{17, 3312, 2761, 568, 583, 2746, 2649, 680, 1637, 1692, 723, 2606, 2288, 1041, 1100, 2229, 1409, 1920, 2662, 667, 3281, 48, 233, 3096, 756, 2573, 2156, 1173, 3015, 314, 3050, 279, 1703, 1626, 1651, 1678, 2789, 540, 1789, 1540, 1847, 1482, 952, 2377, 1461, 1868, 2687, 642, 939, 2390, 2308, 1021, 2437, 892, 2388, 941, 733, 2596, 2337, 992, 268, 3061, 641, 2688, 1584, 1745, 2298, 1031, 2037, 1292, 3220, 109, 375, 2954, 2549, 780, 2090, 1239, 1645, 1684, 1063, 2266, 319, 3010, 2773, 556, 757, 2572, 2099, 1230, 561, 2768, 2466, 863, 2594, 735, 2804, 525, 1092, 2237, 403, 2926, 1026, 2303, 1143, 2186, 2150, 1179, 2775, 554, 886, 2443, 1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300, 2110, 1219, 2935, 394, 885, 2444, 2154, 1175}
-
-// nttMul multiplies two nttElements.
-//
-// It implements MultiplyNTTs, according to FIPS 203 (DRAFT), Algorithm 10.
-func nttMul(f, g nttElement) nttElement {
-	var h nttElement
-	// We use i += 2 for bounds check elimination. See https://go.dev/issue/66826.
-	for i := 0; i < 256; i += 2 {
-		a0, a1 := f[i], f[i+1]
-		b0, b1 := g[i], g[i+1]
-		h[i] = fieldAddMul(a0, b0, fieldMul(a1, b1), gammas[i/2])
-		h[i+1] = fieldAddMul(a0, b1, a1, b0)
-	}
-	return h
-}
-
-// zetas are the values ζ^BitRev7(k) mod q for each index k.
-var zetas = [128]fieldElement{1, 1729, 2580, 3289, 2642, 630, 1897, 848, 1062, 1919, 193, 797, 2786, 3260, 569, 1746, 296, 2447, 1339, 1476, 3046, 56, 2240, 1333, 1426, 2094, 535, 2882, 2393, 2879, 1974, 821, 289, 331, 3253, 1756, 1197, 2304, 2277, 2055, 650, 1977, 2513, 632, 2865, 33, 1320, 1915, 2319, 1435, 807, 452, 1438, 2868, 1534, 2402, 2647, 2617, 1481, 648, 2474, 3110, 1227, 910, 17, 2761, 583, 2649, 1637, 723, 2288, 1100, 1409, 2662, 3281, 233, 756, 2156, 3015, 3050, 1703, 1651, 2789, 1789, 1847, 952, 1461, 2687, 939, 2308, 2437, 2388, 733, 2337, 268, 641, 1584, 2298, 2037, 3220, 375, 2549, 2090, 1645, 1063, 319, 2773, 757, 2099, 561, 2466, 2594, 2804, 1092, 403, 1026, 1143, 2150, 2775, 886, 1722, 1212, 1874, 1029, 2110, 2935, 885, 2154}
-
-// ntt maps a ringElement to its nttElement representation.
-//
-// It implements NTT, according to FIPS 203 (DRAFT), Algorithm 8.
-func ntt(f ringElement) nttElement {
-	k := 1
-	for len := 128; len >= 2; len /= 2 {
-		for start := 0; start < 256; start += 2 * len {
-			zeta := zetas[k]
-			k++
-			// Bounds check elimination hint.
-			f, flen := f[start:start+len], f[start+len:start+len+len]
-			for j := 0; j < len; j++ {
-				t := fieldMul(zeta, flen[j])
-				flen[j] = fieldSub(f[j], t)
-				f[j] = fieldAdd(f[j], t)
-			}
-		}
-	}
-	return nttElement(f)
-}
-
-// inverseNTT maps a nttElement back to the ringElement it represents.
-//
-// It implements NTT⁻¹, according to FIPS 203 (DRAFT), Algorithm 9.
-func inverseNTT(f nttElement) ringElement {
-	k := 127
-	for len := 2; len <= 128; len *= 2 {
-		for start := 0; start < 256; start += 2 * len {
-			zeta := zetas[k]
-			k--
-			// Bounds check elimination hint.
-			f, flen := f[start:start+len], f[start+len:start+len+len]
-			for j := 0; j < len; j++ {
-				t := f[j]
-				f[j] = fieldAdd(t, flen[j])
-				flen[j] = fieldMulSub(zeta, flen[j], t)
-			}
-		}
-	}
-	for i := range f {
-		f[i] = fieldMul(f[i], 3303) // 3303 = 128⁻¹ mod q
-	}
-	return ringElement(f)
-}
-
-// sampleNTT draws a uniformly random nttElement from a stream of uniformly
-// random bytes generated by the XOF function, according to FIPS 203 (DRAFT),
-// Algorithm 6 and Definition 4.2.
-func sampleNTT(rho []byte, ii, jj byte) nttElement {
-	B := sha3.NewShake128()
-	B.Write(rho)
-	B.Write([]byte{ii, jj})
-
-	// SampleNTT essentially draws 12 bits at a time from r, interprets them in
-	// little-endian, and rejects values higher than q, until it drew 256
-	// values. (The rejection rate is approximately 19%.)
-	//
-	// To do this from a bytes stream, it draws three bytes at a time, and
-	// splits them into two uint16 appropriately masked.
-	//
-	//               r₀              r₁              r₂
-	//       |- - - - - - - -|- - - - - - - -|- - - - - - - -|
-	//
-	//               Uint16(r₀ || r₁)
-	//       |- - - - - - - - - - - - - - - -|
-	//       |- - - - - - - - - - - -|
-	//                   d₁
-	//
-	//                                Uint16(r₁ || r₂)
-	//                       |- - - - - - - - - - - - - - - -|
-	//                               |- - - - - - - - - - - -|
-	//                                           d₂
-	//
-	// Note that in little-endian, the rightmost bits are the most significant
-	// bits (dropped with a mask) and the leftmost bits are the least
-	// significant bits (dropped with a right shift).
-
-	var a nttElement
-	var j int        // index into a
-	var buf [24]byte // buffered reads from B
-	off := len(buf)  // index into buf, starts in a "buffer fully consumed" state
-	for {
-		if off >= len(buf) {
-			B.Read(buf[:])
-			off = 0
-		}
-		d1 := binary.LittleEndian.Uint16(buf[off:]) & 0b1111_1111_1111
-		d2 := binary.LittleEndian.Uint16(buf[off+1:]) >> 4
-		off += 3
-		if d1 < q {
-			a[j] = fieldElement(d1)
-			j++
-		}
-		if j >= len(a) {
-			break
-		}
-		if d2 < q {
-			a[j] = fieldElement(d2)
-			j++
-		}
-		if j >= len(a) {
-			break
-		}
-	}
-	return a
-}
diff --git a/randutil/randutil.go b/randutil/randutil.go
new file mode 100644
index 0000000..70a9e96
--- /dev/null
+++ b/randutil/randutil.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package randutil contains internal randomness utilities for various
+// crypto packages.
+package randutil
+
+import (
+	"io"
+	"math/rand/v2"
+)
+
+// MaybeReadByte reads a single byte from r with 50% probability. This is used
+// to ensure that callers do not depend on non-guaranteed behaviour, e.g.
+// assuming that rsa.GenerateKey is deterministic w.r.t. a given random stream.
+//
+// This does not affect tests that pass a stream of fixed bytes as the random
+// source (e.g. a zeroReader).
+func MaybeReadByte(r io.Reader) {
+	if rand.Uint64()&1 == 1 { // low bit of a random uint64: skip the read on one of the two outcomes
+		return
+	}
+	var buf [1]byte
+	r.Read(buf[:]) // byte count and error are discarded; buf is never used, the call only consumes from r
+}
\ No newline at end of file
diff --git a/sha3/shake.go b/sha3/shake.go
new file mode 100644
index 0000000..d4329ea
--- /dev/null
+++ b/sha3/shake.go
@@ -0,0 +1,151 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha3
+
+import (
+	"bytes"
+	//"crypto/internal/fips140"
+	"github.com/xtls/reality/byteorder"
+	"errors"
+	"math/bits"
+)
+
+type SHAKE struct {
+	d Digest // SHA-3 state context and Read/Write operations
+
+	// initBlock is the cSHAKE-specific initialization prefix. newCShake builds it
+	// as leftEncode(8*len(N)) || N || leftEncode(8*len(S)) || S (presumably the
+	// encoding from NIST SP 800-185, Section 3.3); NewShake128/256 leave it nil.
+	// It is stored here so that Reset can re-absorb it and put the context back
+	// into its initial state.
+	initBlock []byte
+}
+
+func bytepad(data []byte, rate int) []byte {
+	out := make([]byte, 0, 9+len(data)+rate-1) // room for leftEncode (max 9 bytes), data, and up to rate-1 zero bytes
+	out = append(out, leftEncode(uint64(rate))...)
+	out = append(out, data...)
+	if padlen := rate - len(out)%rate; padlen < rate { // padlen == rate means out is already a multiple of rate
+		out = append(out, make([]byte, padlen)...) // zero-fill up to the next multiple of rate
+	}
+	return out
+}
+
+func leftEncode(x uint64) []byte {
+	// Let n be the smallest positive integer for which 2^(8n) > x.
+	n := (bits.Len64(x) + 7) / 8
+	if n == 0 { // x == 0 has bit length 0 but is still encoded with one value byte
+		n = 1
+	}
+	// Return n || x, with n as a single byte and x as n bytes in big-endian order.
+	b := make([]byte, 9)
+	byteorder.BEPutUint64(b[1:], x)
+	b = b[9-n-1:] // keep the n low-order bytes of x plus one byte in front for the length
+	b[0] = byte(n)
+	return b
+}
+
+func newCShake(N, S []byte, rate, outputLen int, dsbyte byte) *SHAKE {
+	c := &SHAKE{d: Digest{rate: rate, outputLen: outputLen, dsbyte: dsbyte}}
+	c.initBlock = make([]byte, 0, 9+len(N)+9+len(S)) // leftEncode returns max 9 bytes
+	c.initBlock = append(c.initBlock, leftEncode(uint64(len(N))*8)...) // length of N in bits
+	c.initBlock = append(c.initBlock, N...)
+	c.initBlock = append(c.initBlock, leftEncode(uint64(len(S))*8)...) // length of S in bits
+	c.initBlock = append(c.initBlock, S...)
+	c.Write(bytepad(c.initBlock, c.d.rate)) // absorb the rate-padded prefix before any caller data
+	return c
+}
+
+func (s *SHAKE) BlockSize() int { return s.d.BlockSize() }
+func (s *SHAKE) Size() int      { return s.d.Size() }
+
+// Sum appends a portion of output to b and returns the resulting slice. The
+// output length is selected to provide full-strength generic security: 32 bytes
+// for SHAKE128 and 64 bytes for SHAKE256. It does not change the underlying
+// state. It panics if any output has already been read.
+func (s *SHAKE) Sum(in []byte) []byte { return s.d.Sum(in) }
+
+// Write absorbs more data into the hash's state.
+// It panics if any output has already been read.
+func (s *SHAKE) Write(p []byte) (n int, err error) { return s.d.Write(p) }
+
+func (s *SHAKE) Read(out []byte) (n int, err error) {
+	//fips140.RecordApproved()
+	// Note that read is not exposed on Digest since SHA-3 does not offer
+	// variable output length. It is only used internally by Sum.
+	return s.d.read(out)
+}
+
+// Reset resets the hash to its initial state.
+func (s *SHAKE) Reset() {
+	s.d.Reset()
+	if len(s.initBlock) != 0 { // a stored cSHAKE prefix must be absorbed again, as newCShake did
+		s.Write(bytepad(s.initBlock, s.d.rate))
+	}
+}
+
+// Clone returns a copy of the SHAKE context in its current state.
+func (s *SHAKE) Clone() *SHAKE {
+	ret := *s // struct copy: the initBlock slice header is copied, so both values share its backing array
+	return &ret
+}
+
+func (s *SHAKE) MarshalBinary() ([]byte, error) {
+	return s.AppendBinary(make([]byte, 0, marshaledSize+len(s.initBlock))) // capacity pre-sized for marshaledSize bytes plus initBlock
+}
+
+func (s *SHAKE) AppendBinary(b []byte) ([]byte, error) {
+	b, err := s.d.AppendBinary(b)
+	if err != nil {
+		return nil, err
+	}
+	b = append(b, s.initBlock...) // initBlock follows the Digest encoding; UnmarshalBinary reads it from offset marshaledSize
+	return b, nil
+}
+
+func (s *SHAKE) UnmarshalBinary(b []byte) error {
+	if len(b) < marshaledSize {
+		return errors.New("sha3: invalid hash state")
+	}
+	if err := s.d.UnmarshalBinary(b[:marshaledSize]); err != nil { // the first marshaledSize bytes are handed to the Digest
+		return err
+	}
+	s.initBlock = bytes.Clone(b[marshaledSize:]) // the remainder is the initBlock; cloned so it does not alias b
+	return nil
+}
+
+// NewShake128 creates a new SHAKE128 XOF.
+func NewShake128() *SHAKE {
+	return &SHAKE{d: Digest{rate: rateK256, outputLen: 32, dsbyte: dsbyteShake}}
+}
+
+// NewShake256 creates a new SHAKE256 XOF.
+func NewShake256() *SHAKE {
+	return &SHAKE{d: Digest{rate: rateK512, outputLen: 64, dsbyte: dsbyteShake}}
+}
+
+// NewCShake128 creates a new cSHAKE128 XOF.
+//
+// N is used to define functions based on cSHAKE, it can be empty when plain
+// cSHAKE is desired. S is a customization byte string used for domain
+// separation. When N and S are both empty, this is equivalent to NewShake128.
+func NewCShake128(N, S []byte) *SHAKE {
+	if len(N) == 0 && len(S) == 0 {
+		return NewShake128()
+	}
+	return newCShake(N, S, rateK256, 32, dsbyteCShake)
+}
+
+// NewCShake256 creates a new cSHAKE256 XOF.
+//
+// N is used to define functions based on cSHAKE, it can be empty when plain
+// cSHAKE is desired. S is a customization byte string used for domain
+// separation. When N and S are both empty, this is equivalent to NewShake256.
+func NewCShake256(N, S []byte) *SHAKE {
+	if len(N) == 0 && len(S) == 0 {
+		return NewShake256()
+	}
+	return newCShake(N, S, rateK512, 64, dsbyteCShake)
+}
\ No newline at end of file