diff --git a/websocket_test.go b/websocket_test.go
index f4073bce64d7fbfba80c06b409720b13c45df76c..8d18c738bb73c1ca7d940aed309b50ba19269de9 100644
--- a/websocket_test.go
+++ b/websocket_test.go
@@ -776,6 +776,7 @@ func benchConn(b *testing.B, echo, stream bool, size int) {
 func BenchmarkConn(b *testing.B) {
 	sizes := []int{
 		2,
+		16,
 		32,
 		512,
 		4096,
diff --git a/xor.go b/xor.go
index 5a68e81d990b2782cf886658a93006a5e8087df6..a58a72f473199a023b8322fbecf8890fcf482176 100644
--- a/xor.go
+++ b/xor.go
@@ -13,10 +13,10 @@ import (
 // to be used for masking in the key. This is so that
 // unmasking can be performed without the entire frame.
 func fastXOR(key [4]byte, keyPos int, b []byte) int {
-	// If the payload is greater than 16 bytes, then it's worth
+	// If the payload is greater than or equal to 16 bytes, then it's worth
 	// masking 8 bytes at a time.
 	// Optimization from https://github.com/golang/go/issues/31586#issuecomment-485530859
-	if len(b) > 16 {
+	if len(b) >= 16 {
 		// We first create a key that is 8 bytes long
 		// and is aligned on the position correctly.
 		var alignedKey [8]byte
@@ -25,6 +25,86 @@ func fastXOR(key [4]byte, keyPos int, b []byte) int {
 		}
 		k := binary.LittleEndian.Uint64(alignedKey[:])
 
+		// Then we xor 128 bytes (sixteen 8-byte words) per iteration until b is less than 128 bytes.
+		for len(b) >= 128 {
+			v := binary.LittleEndian.Uint64(b)
+			binary.LittleEndian.PutUint64(b, v^k)
+			v = binary.LittleEndian.Uint64(b[8:])
+			binary.LittleEndian.PutUint64(b[8:], v^k)
+			v = binary.LittleEndian.Uint64(b[16:])
+			binary.LittleEndian.PutUint64(b[16:], v^k)
+			v = binary.LittleEndian.Uint64(b[24:])
+			binary.LittleEndian.PutUint64(b[24:], v^k)
+			v = binary.LittleEndian.Uint64(b[32:])
+			binary.LittleEndian.PutUint64(b[32:], v^k)
+			v = binary.LittleEndian.Uint64(b[40:])
+			binary.LittleEndian.PutUint64(b[40:], v^k)
+			v = binary.LittleEndian.Uint64(b[48:])
+			binary.LittleEndian.PutUint64(b[48:], v^k)
+			v = binary.LittleEndian.Uint64(b[56:])
+			binary.LittleEndian.PutUint64(b[56:], v^k)
+			v = binary.LittleEndian.Uint64(b[64:])
+			binary.LittleEndian.PutUint64(b[64:], v^k)
+			v = binary.LittleEndian.Uint64(b[72:])
+			binary.LittleEndian.PutUint64(b[72:], v^k)
+			v = binary.LittleEndian.Uint64(b[80:])
+			binary.LittleEndian.PutUint64(b[80:], v^k)
+			v = binary.LittleEndian.Uint64(b[88:])
+			binary.LittleEndian.PutUint64(b[88:], v^k)
+			v = binary.LittleEndian.Uint64(b[96:])
+			binary.LittleEndian.PutUint64(b[96:], v^k)
+			v = binary.LittleEndian.Uint64(b[104:])
+			binary.LittleEndian.PutUint64(b[104:], v^k)
+			v = binary.LittleEndian.Uint64(b[112:])
+			binary.LittleEndian.PutUint64(b[112:], v^k)
+			v = binary.LittleEndian.Uint64(b[120:])
+			binary.LittleEndian.PutUint64(b[120:], v^k)
+			b = b[128:]
+		}
+
+		// Then we xor 64 bytes (eight 8-byte words) per iteration until b is less than 64 bytes.
+		for len(b) >= 64 {
+			v := binary.LittleEndian.Uint64(b)
+			binary.LittleEndian.PutUint64(b, v^k)
+			v = binary.LittleEndian.Uint64(b[8:])
+			binary.LittleEndian.PutUint64(b[8:], v^k)
+			v = binary.LittleEndian.Uint64(b[16:])
+			binary.LittleEndian.PutUint64(b[16:], v^k)
+			v = binary.LittleEndian.Uint64(b[24:])
+			binary.LittleEndian.PutUint64(b[24:], v^k)
+			v = binary.LittleEndian.Uint64(b[32:])
+			binary.LittleEndian.PutUint64(b[32:], v^k)
+			v = binary.LittleEndian.Uint64(b[40:])
+			binary.LittleEndian.PutUint64(b[40:], v^k)
+			v = binary.LittleEndian.Uint64(b[48:])
+			binary.LittleEndian.PutUint64(b[48:], v^k)
+			v = binary.LittleEndian.Uint64(b[56:])
+			binary.LittleEndian.PutUint64(b[56:], v^k)
+			b = b[64:]
+		}
+
+		// Then we xor 32 bytes (four 8-byte words) per iteration until b is less than 32 bytes.
+		for len(b) >= 32 {
+			v := binary.LittleEndian.Uint64(b)
+			binary.LittleEndian.PutUint64(b, v^k)
+			v = binary.LittleEndian.Uint64(b[8:])
+			binary.LittleEndian.PutUint64(b[8:], v^k)
+			v = binary.LittleEndian.Uint64(b[16:])
+			binary.LittleEndian.PutUint64(b[16:], v^k)
+			v = binary.LittleEndian.Uint64(b[24:])
+			binary.LittleEndian.PutUint64(b[24:], v^k)
+			b = b[32:]
+		}
+
+		// Then we xor 16 bytes (two 8-byte words) per iteration until b is less than 16 bytes.
+		for len(b) >= 16 {
+			v := binary.LittleEndian.Uint64(b)
+			binary.LittleEndian.PutUint64(b, v^k)
+			v = binary.LittleEndian.Uint64(b[8:])
+			binary.LittleEndian.PutUint64(b[8:], v^k)
+			b = b[16:]
+		}
+
 		// Then we xor until b is less than 8 bytes.
 		for len(b) >= 8 {
 			v := binary.LittleEndian.Uint64(b)
diff --git a/xor_test.go b/xor_test.go
index c3adaf580a499bb02891ac6da38143d19013ce3e..634af606ac0184ef6ba244a7fb168f2e7403d1ca 100644
--- a/xor_test.go
+++ b/xor_test.go
@@ -36,6 +36,7 @@ func basixXOR(maskKey [4]byte, pos int, b []byte) int {
 func BenchmarkXOR(b *testing.B) {
 	sizes := []int{
 		2,
+		16,
 		32,
 		512,
 		4096,