Post-quantum cryptography introduces new performance characteristics compared to classical algorithms. This guide covers optimization techniques for Kyber and SPHINCS+ implementations, helping you achieve production-ready performance. The SynX quantum-resistant wallet uses these techniques extensively.
Performance Baseline
Understanding baseline performance helps identify optimization opportunities:
Kyber-768 Performance (Intel i7-12700, single thread)
Key Generation
~25 μs (40,000 ops/sec)
Encapsulation
~30 μs (33,000 ops/sec)
Decapsulation
~28 μs (36,000 ops/sec)
SPHINCS+-128s Performance (Intel i7-12700, single thread)
Key Generation
~1.5 ms (≈670 ops/sec)
Signing
~50-80 ms (12-20 ops/sec)
Verification
~2 ms (500 ops/sec)
Algorithm Selection Optimization
Choose the right variant for your use case:
| Algorithm | Use Case | Trade-off |
|---|---|---|
| SPHINCS+-128s | Size-constrained (wallets) | Slower signing, smaller signatures |
| SPHINCS+-128f | Speed-critical (servers) | Faster signing, 2x larger signatures |
| Kyber-512 | Resource-constrained | Lower security margin |
| Kyber-768 | Standard (recommended) | Best balance |
| Kyber-1024 | Maximum security | ~30% slower than 768 |
SynX Choice: The SynX quantum-resistant wallet uses SPHINCS+-128s for user transactions (infrequent signing, size matters) and SPHINCS+-128f for validator operations (frequent signing, bandwidth available).
Parallelization Strategies
Parallel Signature Generation
import concurrent.futures
import os
import time
from typing import Iterator, List, Optional, Tuple

import oqs
class ParallelSigner:
    """
    Parallel SPHINCS+ signing for batch operations.

    Use when signing multiple independent messages. Each worker task
    constructs its own oqs.Signature object rather than sharing one
    across threads.
    """

    def __init__(self, max_workers: Optional[int] = None):
        """
        Initialize parallel signer.

        Args:
            max_workers: CPU threads to use (default: CPU count).
        """
        # `or` also maps an explicit 0 to the CPU count.
        # Requires `import os` at module level.
        self.max_workers = max_workers or os.cpu_count()

    def sign_batch(
        self,
        messages: List[bytes],
        secret_key: bytes
    ) -> List[bytes]:
        """
        Sign multiple messages in parallel.

        Args:
            messages: List of messages to sign.
            secret_key: SPHINCS+ secret key.

        Returns:
            List of signatures in same order as messages
            (executor.map preserves input order).
        """
        def sign_single(message: bytes) -> bytes:
            # Fresh Signature object per task: avoids sharing mutable
            # native state between threads.
            sig = oqs.Signature("SPHINCS+-SHAKE-128s-simple", secret_key)
            return sig.sign(message)

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.max_workers
        ) as executor:
            return list(executor.map(sign_single, messages))

    def sign_with_keys(
        self,
        items: List[Tuple[bytes, bytes]]
    ) -> List[bytes]:
        """Sign (message, secret_key) pairs in parallel, one key per item."""
        def sign_item(item: Tuple[bytes, bytes]) -> bytes:
            message, sk = item
            sig = oqs.Signature("SPHINCS+-SHAKE-128s-simple", sk)
            return sig.sign(message)

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.max_workers
        ) as executor:
            return list(executor.map(sign_item, items))
def benchmark_parallel_vs_sequential():
    """Compare sequential signing against ParallelSigner on 16 messages."""
    # One keypair shared by both runs.
    sig = oqs.Signature("SPHINCS+-SHAKE-128s-simple")
    sig.generate_keypair()
    sk = sig.export_secret_key()

    messages = [f"Message {i}".encode() for i in range(16)]

    # Sequential baseline: fresh signer object per message.
    start = time.perf_counter()
    sequential_sigs = [
        oqs.Signature("SPHINCS+-SHAKE-128s-simple", sk).sign(msg)
        for msg in messages
    ]
    seq_time = time.perf_counter() - start

    # Thread-pool run over the same batch.
    signer = ParallelSigner()
    start = time.perf_counter()
    parallel_sigs = signer.sign_batch(messages, sk)
    par_time = time.perf_counter() - start

    print(f"Sequential: {seq_time:.2f}s ({len(messages)/seq_time:.1f} msg/s)")
    print(f"Parallel: {par_time:.2f}s ({len(messages)/par_time:.1f} msg/s)")
    print(f"Speedup: {seq_time/par_time:.2f}x")
Parallel Verification
class ParallelVerifier:
    """Parallel signature verification for validators."""

    def __init__(self, max_workers: Optional[int] = None):
        # Requires `import os` at module level.
        self.max_workers = max_workers or os.cpu_count()

    def verify_batch(
        self,
        items: List[Tuple[bytes, bytes, bytes]]
    ) -> List[bool]:
        """
        Verify multiple (message, signature, public_key) triples in parallel.

        Returns:
            List of verification results, in input order. A triple whose
            verification raises (malformed key/signature) counts as
            invalid instead of aborting the whole batch.
        """
        def verify_single(item: Tuple[bytes, bytes, bytes]) -> bool:
            message, signature, public_key = item
            try:
                sig = oqs.Signature("SPHINCS+-SHAKE-128s-simple")
                return sig.verify(message, signature, public_key)
            except Exception:
                # Narrowed from bare `except:` so KeyboardInterrupt /
                # SystemExit propagate; library errors still map to False.
                return False

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.max_workers
        ) as executor:
            return list(executor.map(verify_single, items))

    def all_valid(
        self,
        items: List[Tuple[bytes, bytes, bytes]]
    ) -> bool:
        """Quick check if all signatures valid (True for an empty batch)."""
        return all(self.verify_batch(items))
async def validate_block_transactions(transactions: List):
    """Filter a block's transactions down to those with valid signatures.

    Assumes each transaction exposes signing_message, signature and
    public_key attributes.
    """
    verifier = ParallelVerifier(max_workers=8)
    triples = [
        (tx.signing_message, tx.signature, tx.public_key)
        for tx in transactions
    ]
    flags = verifier.verify_batch(triples)
    return [tx for tx, ok in zip(transactions, flags) if ok]
Caching Strategies
Key Caching
from functools import lru_cache
import hashlib
class KeyCache:
    """
    Cache derived keys to avoid repeated derivation.

    Useful for HD wallets where same paths are accessed frequently.
    Eviction is FIFO: the oldest inserted entry is dropped first.
    """

    def __init__(self, max_size: int = 1000):
        self.max_size = max_size
        self._cache: dict = {}

    def get_or_derive(
        self,
        master_seed: bytes,
        path: str,
        derive_func
    ) -> Tuple[bytes, bytes]:
        """
        Get cached key or derive and cache.

        Args:
            master_seed: Wallet master seed.
            path: Derivation path.
            derive_func: Called as derive_func(master_seed, path) on a miss.

        Returns:
            (public_key, secret_key) tuple.
        """
        digest = hashlib.blake2b(master_seed + path.encode()).hexdigest()
        cache_key = digest[:32]

        cached = self._cache.get(cache_key)
        if cached is not None:
            return cached

        pk, sk = derive_func(master_seed, path)

        # At capacity: drop the oldest entry (dicts keep insertion order).
        if len(self._cache) >= self.max_size:
            self._cache.pop(next(iter(self._cache)))

        self._cache[cache_key] = (pk, sk)
        return pk, sk

    def clear(self):
        """Clear all cached keys (call on wallet lock).

        Best-effort scrub: each slot is overwritten with zero bytes
        before deletion (the original immutable bytes objects still
        await garbage collection).
        """
        for cache_key in list(self._cache):
            pk, sk = self._cache[cache_key]
            self._cache[cache_key] = (bytes(len(pk)), bytes(len(sk)))
            del self._cache[cache_key]
class OptimizedWallet:
    """Wallet that memoizes derived address keys through a KeyCache."""

    def __init__(self, master_seed: bytes):
        self.master_seed = master_seed
        self.key_cache = KeyCache(max_size=500)

    def get_address_keys(self, path: str) -> Tuple[bytes, bytes]:
        """Return (public_key, secret_key) for *path*, deriving on first use."""
        return self.key_cache.get_or_derive(
            self.master_seed, path, self._derive_keys
        )

    def _derive_keys(self, seed: bytes, path: str):
        # Derivation body intentionally omitted in this example.
        ...
Verification Result Caching
class SignatureCache:
    """
    Cache signature verification results.

    For validators to avoid re-verifying seen transactions.
    """

    def __init__(self, max_size: int = 10000):
        self.max_size = max_size
        self._verified: dict[str, bool] = {}

    def _signature_id(
        self,
        message: bytes,
        signature: bytes,
        public_key: bytes
    ) -> str:
        """Create unique ID for a (message, signature, public_key) triple.

        Hashes the *full* signature — the previous version truncated to
        signature[:64], so two different signatures sharing a 64-byte
        prefix collided and reused each other's cached verdict — and
        length-prefixes each field so field boundaries are unambiguous.
        """
        h = hashlib.blake2b(digest_size=16)
        for part in (message, signature, public_key):
            h.update(len(part).to_bytes(8, "big"))
            h.update(part)
        return h.hexdigest()

    def check_or_verify(
        self,
        message: bytes,
        signature: bytes,
        public_key: bytes
    ) -> bool:
        """Check cache or verify and cache result."""
        sig_id = self._signature_id(message, signature, public_key)
        if sig_id in self._verified:
            return self._verified[sig_id]

        sig = oqs.Signature("SPHINCS+-SHAKE-128s-simple")
        is_valid = sig.verify(message, signature, public_key)

        # At capacity: evict ~10% of the oldest entries (insertion order).
        if len(self._verified) >= self.max_size:
            for key in list(self._verified)[:self.max_size // 10]:
                del self._verified[key]

        self._verified[sig_id] = is_valid
        return is_valid
Memory Optimization
import gc
class MemoryEfficientSigner:
    """
    Memory-efficient signing for embedded/mobile devices.

    Note: the Iterator annotation below is evaluated when the class is
    defined, so `from typing import Iterator` must be present at module
    level (the original file used the name without importing it).
    """

    def sign_and_release(
        self,
        message: bytes,
        secret_key: bytes
    ) -> bytes:
        """
        Sign message and immediately release key memory.

        Use for one-off signatures where key shouldn't persist.
        Zeroization is best-effort: it only works when secret_key is a
        mutable bytearray — immutable bytes cannot be wiped in place.
        """
        sig_obj = oqs.Signature("SPHINCS+-SHAKE-128s-simple", secret_key)
        signature = sig_obj.sign(message)

        # Drop the native signer before scrubbing the key material.
        del sig_obj
        if isinstance(secret_key, bytearray):
            # Single C-level slice assignment instead of a per-byte loop.
            secret_key[:] = bytes(len(secret_key))
        gc.collect()

        return signature

    def streaming_sign(
        self,
        message_chunks: Iterator[bytes],
        secret_key: bytes
    ) -> bytes:
        """
        Sign streaming message without loading all into memory.

        Pre-hash the message in chunks, then sign the 32-byte digest.
        Verifiers must hash the message the same way before verifying.
        """
        hasher = hashlib.blake2b(digest_size=32)
        for chunk in message_chunks:
            hasher.update(chunk)

        sig = oqs.Signature("SPHINCS+-SHAKE-128s-simple", secret_key)
        return sig.sign(hasher.digest())
Hardware Acceleration
AVX2/AVX-512 Optimization
Most PQC libraries have optimized assembly for x86_64:
import subprocess
def get_cpu_features() -> set:
    """Detect available CPU SIMD features by scanning /proc/cpuinfo.

    Returns:
        Subset of {"avx2", "avx512", "aesni"}; empty set when
        /proc/cpuinfo cannot be read (e.g. non-Linux platforms).
    """
    try:
        with open("/proc/cpuinfo") as f:
            cpuinfo = f.read()
    except OSError:
        # Narrowed from bare `except:` so only a failed read is
        # treated as "no features"; other exceptions now propagate.
        return set()

    features = set()
    # Substring checks suffice: flag tokens such as "avx2" and
    # "avx512f" appear verbatim in the flags line.
    if "avx2" in cpuinfo:
        features.add("avx2")
    if "avx512" in cpuinfo:
        features.add("avx512")
    if "aes" in cpuinfo:
        features.add("aesni")
    return features
def select_optimal_variant() -> str:
    """Select best SPHINCS+ variant for this CPU.

    Every branch currently returns the same algorithm name; the
    branches differ only in which implementation tier they report.
    """
    features = get_cpu_features()

    if "avx512" in features:
        tier = "Using AVX-512 optimized implementation"
    elif "avx2" in features:
        tier = "Using AVX2 optimized implementation"
    else:
        tier = "Using reference implementation"

    print(tier)
    return "SPHINCS+-SHAKE-128s-simple"
Performance Comparison by Platform
| Platform | SPHINCS+ Sign | Kyber Encap | Notes |
|---|---|---|---|
| x86_64 + AVX2 | ~50ms | ~25μs | Reference performance |
| x86_64 + AVX-512 | ~35ms | ~18μs | ~30% faster |
| ARM64 (Apple M1) | ~45ms | ~20μs | NEON optimized |
| ARM Cortex-A72 | ~120ms | ~80μs | Raspberry Pi 4 |
| WASM (Browser) | ~500ms | ~150μs | No SIMD |
Benchmarking Your Implementation
import statistics
import time
class CryptoBenchmark:
    """Comprehensive PQC benchmarking."""

    def __init__(self, iterations: int = 100):
        # Default iteration count; benchmark_operation may override.
        self.iterations = iterations

    def benchmark_operation(
        self,
        name: str,
        operation,
        setup=None,
        iterations: Optional[int] = None
    ) -> dict:
        """Benchmark a single operation.

        Args:
            name: Label stored in the result dict.
            operation: Zero-arg callable, or one-arg callable taking the
                value produced by setup.
            setup: Optional per-iteration factory; runs outside the
                timed region and its result is passed to operation.
            iterations: Per-call override of self.iterations for slow
                operations. (Fixes a TypeError: run_full_benchmark
                passed iterations=20 but the old signature had no such
                parameter.)

        Returns:
            Dict with name, mean/median/stdev/min/max in ms, ops_per_sec.
        """
        runs = iterations if iterations is not None else self.iterations
        times = []
        for _ in range(runs):
            ctx = setup() if setup else None
            start = time.perf_counter()
            if setup:
                operation(ctx)
            else:
                operation()
            # Milliseconds per iteration.
            times.append((time.perf_counter() - start) * 1000)

        mean_ms = statistics.mean(times)
        return {
            "name": name,
            "mean": mean_ms,
            "median": statistics.median(times),
            # stdev needs >= 2 samples; report 0.0 for a single run
            # instead of raising StatisticsError.
            "stdev": statistics.stdev(times) if len(times) > 1 else 0.0,
            "min": min(times),
            "max": max(times),
            "ops_per_sec": 1000 / mean_ms
        }

    def run_full_benchmark(self) -> dict:
        """Run complete PQC benchmark suite (requires the oqs library)."""
        results = {}

        results["kyber_keygen"] = self.benchmark_operation(
            "Kyber-768 KeyGen",
            lambda: oqs.KeyEncapsulation("Kyber768").generate_keypair()
        )

        kem = oqs.KeyEncapsulation("Kyber768")
        pk = kem.generate_keypair()
        sk = kem.export_secret_key()
        results["kyber_encap"] = self.benchmark_operation(
            "Kyber-768 Encap",
            lambda: kem.encap_secret(pk)
        )

        ct, _ = kem.encap_secret(pk)
        kem_dec = oqs.KeyEncapsulation("Kyber768", sk)
        results["kyber_decap"] = self.benchmark_operation(
            "Kyber-768 Decap",
            lambda: kem_dec.decap_secret(ct)
        )

        results["sphincs_keygen"] = self.benchmark_operation(
            "SPHINCS+-128s KeyGen",
            lambda: oqs.Signature("SPHINCS+-SHAKE-128s-simple").generate_keypair()
        )

        sig = oqs.Signature("SPHINCS+-SHAKE-128s-simple")
        sig.generate_keypair()
        spx_sk = sig.export_secret_key()
        msg = b"x" * 256
        # SPHINCS+ signing is ~ms-scale, so cap at 20 iterations.
        results["sphincs_sign"] = self.benchmark_operation(
            "SPHINCS+-128s Sign",
            lambda: oqs.Signature("SPHINCS+-SHAKE-128s-simple", spx_sk).sign(msg),
            iterations=20
        )

        return results

    def print_results(self, results: dict):
        """Pretty print benchmark results."""
        print("\n=== PQC Performance Benchmark ===")
        print(f"Iterations: {self.iterations}\n")
        for key, data in results.items():
            print(f"{data['name']}:")
            print(f" Mean: {data['mean']:.3f}ms")
            print(f" Median: {data['median']:.3f}ms")
            print(f" Ops/sec: {data['ops_per_sec']:.1f}")
            print()
# Entry point: run the full oqs benchmark suite with a reduced
# iteration count (50 instead of the default 100) for quicker feedback.
if __name__ == "__main__":
    bench = CryptoBenchmark(iterations=50)
    results = bench.run_full_benchmark()
    bench.print_results(results)
Frequently Asked Questions
How can I speed up SPHINCS+ signing?
Use SPHINCS+-128f instead of SPHINCS+-128s for 3-5x faster signing at the cost of 2x larger signatures. For batch operations, parallelize independent signatures. Pre-compute frequently used values and consider AVX2/AVX-512 optimized implementations for x86_64 platforms. The SynX quantum-resistant wallet uses parallel signing for transaction batches.
What is the typical performance difference between Kyber and ECDH?
Kyber-768 key generation is roughly 2-3x slower than secp256k1. Encapsulation/decapsulation is comparable or slightly slower. The main overhead is in key/ciphertext size (1KB+ vs 32-64 bytes), not computation time. Modern CPUs with AVX2 can perform 10,000+ Kyber operations per second.
Optimization vs. Security
Never sacrifice security for performance. All optimizations in the SynX quantum-resistant wallet are thoroughly reviewed to ensure no side-channel leaks or security weaknesses are introduced.