"""
Advanced tensor core with electron-speed operations, quantum acceleration,
and persistent DB-backed caching for optimal tensor operations.
"""

import numpy as np
import time
import logging
import hashlib
import threading
import json
import sqlite3
import asyncio
from typing import Dict, Optional, Tuple, List, Union
from concurrent.futures import ThreadPoolExecutor
from local_storage_manager import LocalStorageManager
from tensor_db_cache import TensorDBCache

from tensor_output_formatter import TensorOutputFormatter

class UnitDistributor:
    """Distribute tensor chunks across the units of one core and track per-unit statistics.

    Every unit is a ``TensorUnit`` whose ``switching_frequency``,
    ``electron_drift_velocity`` and ``ops_per_cycle`` are overwritten here so
    that they scale with the unit index.  ``unit_stats`` keeps running
    counters per unit; the workload balancer reads them on every call.
    """

    def __init__(self, core_id: int, num_units: int = 30):
        """Create ``num_units`` units for core ``core_id`` and zero their statistics."""
        self.core_id = core_id
        self.num_units = num_units

        # Initialize base speeds and frequencies
        self.base_speed = 8.92e85  # Base computational speed in Hz
        self.core_speed = self.base_speed + (core_id * self.base_speed)  # Scale with core ID
        self.electron_drift_base = 8.96e7  # Base electron drift velocity
        self.process_node = 14e-9  # 14nm process node

        # Create units with scaled speeds
        self.units = []
        for i in range(num_units):
            unit = TensorUnit(i, core_id)
            unit.switching_frequency = self.base_speed + (i * self.base_speed)  # Each unit faster
            unit.electron_drift_velocity = self.electron_drift_base * (i + 1)
            unit.ops_per_cycle = int(unit.switching_frequency)  # Max throughput
            self.units.append(unit)

        self.output_formatter = TensorOutputFormatter()

        # Per-unit running counters plus a snapshot of each unit's speed settings
        self.unit_stats = {
            i: {
                'total_ops': 0,
                'cache_hits': 0,
                'processing_time': 0,
                'chunks_processed': 0,
                'last_utilization': 0,  # Tracks recent unit utilization
                'switching_frequency': unit.switching_frequency,
                'electron_drift': unit.electron_drift_velocity,
                'ops_per_cycle': unit.ops_per_cycle,
            }
            for i, unit in enumerate(self.units)
        }

    async def distribute_chunks(self, chunks: List[np.ndarray], operation: str) -> Dict:
        """Distribute chunks across units with workload balancing.

        Args:
            chunks: Tensor chunks to process.
            operation: Operation name forwarded to each unit.

        Returns:
            Dict with ``results`` (per-chunk result dicts), ``total_ops``,
            ``cache_hits``, ``processing_time`` (wall time of this call) and
            ``unit_stats`` (the live statistics mapping of this distributor).

        Raises:
            Exception: Any error raised while balancing or processing is
                logged and re-raised unchanged.
        """
        start_time = time.time()
        total_cache_hits = 0
        total_ops = 0
        all_results = []

        try:
            assignments = self._balance_workload(chunks)

            unit_tasks = [
                asyncio.create_task(self._process_unit_chunks(unit_id, unit_chunks, operation))
                for unit_id, unit_chunks in assignments.items()
            ]
            unit_results = await asyncio.gather(*unit_tasks)

            # Same floor as _process_unit_chunks uses, so a very fast batch
            # cannot divide by zero.
            elapsed = max(0.001, time.time() - start_time)
            for result in unit_results:
                total_cache_hits += result['cache_hits']
                total_ops += result['ops']
                all_results.extend(result['results'])

                # The running counters were already accumulated by
                # _process_unit_chunks; adding them again here would count
                # every batch twice.  Only the batch-relative utilization is
                # known at this level.
                self.unit_stats[result['unit_id']]['last_utilization'] = (
                    result['processing_time'] / elapsed
                )

            return {
                'results': all_results,
                'total_ops': total_ops,
                'cache_hits': total_cache_hits,
                'processing_time': time.time() - start_time,
                'unit_stats': self.unit_stats
            }

        except Exception as e:
            logging.error(f"Error in unit distribution for core {self.core_id}: {str(e)}")
            raise

    def _balance_workload(self, chunks: List[np.ndarray]) -> Dict[int, List[np.ndarray]]:
        """Split ``chunks`` between units in proportion to their combined weight.

        A unit's combined weight is its history weight (see
        ``_calculate_unit_weights``) multiplied by
        ``switching_frequency * electron_drift_velocity / process_node``.
        Units are served from the heaviest to the lightest; each gets at
        least one chunk while chunks remain, and the last-ranked unit takes
        whatever is left so that no chunk is dropped.

        Returns:
            Mapping of unit id to its consecutive slice of ``chunks``.
            Units that receive nothing are absent; empty input yields ``{}``.
        """
        remaining = list(chunks)
        if not remaining:
            return {}

        combined_weights = {}
        for unit_id, history_weight in self._calculate_unit_weights().items():
            unit = self.units[unit_id]
            capability = (unit.switching_frequency * unit.electron_drift_velocity) / self.process_node
            combined_weights[unit_id] = history_weight * capability

        # Shares must be normalized by the sum of the weights actually used
        # for ranking, otherwise a share can exceed 1.
        total_weight = sum(combined_weights.values())
        ranked = sorted(combined_weights.items(), key=lambda item: item[1], reverse=True)

        assignments = {}
        total_chunks = len(remaining)
        last_rank = len(ranked) - 1
        for rank, (unit_id, weight) in enumerate(ranked):
            if not remaining:
                break

            if rank == last_rank:
                # The remainder belongs to the last unit in ranking order, not
                # to the unit with the highest id (which usually ranks first).
                take = len(remaining)
            else:
                share = weight / total_weight if total_weight > 0 else 1 / len(ranked)
                take = max(1, int(share * total_chunks))

            assignments[unit_id] = remaining[:take]
            remaining = remaining[take:]

        return assignments

    def _calculate_unit_weights(self) -> Dict[int, float]:
        """Calculate a history-based weight for every unit.

        Units with recorded processing time are scored from ops per second,
        cache hit rate and inverse recent utilization; units without history
        get 1.0.  Every weight is floored at 0.1.
        """
        weights = {}

        for unit_id in range(self.num_units):
            stats = self.unit_stats[unit_id]

            if stats['processing_time'] > 0:
                ops_per_second = stats['total_ops'] / stats['processing_time']
                cache_hit_rate = stats['cache_hits'] / max(1, stats['chunks_processed'])

                weight = (
                    0.5 * ops_per_second +
                    0.3 * cache_hit_rate +
                    0.2 * (1 - stats['last_utilization'])  # Prefer less utilized units
                )
            else:
                weight = 1.0  # Default weight for new units

            weights[unit_id] = max(0.1, weight)  # Ensure minimum weight

        return weights

    async def _process_unit_chunks(self, unit_id: int, chunks: List[np.ndarray], operation: str) -> Dict:
        """Process ``chunks`` on one unit and report what ``distribute_chunks`` needs.

        A chunk whose estimated op count exceeds the unit's per-cycle budget
        is split along axis 0, processed piecewise and merged again.

        Returns:
            Dict with ``unit_id``, ``results`` (one result dict per chunk),
            ``ops``, ``cache_hits`` and ``processing_time``.  For callers that
            relied on the previous return value, the aggregated view is still
            present under ``result`` and ``metadata``.
        """
        unit = self.units[unit_id]
        start_time = time.time()

        # Op budget for this cycle.  Clamp to 1 so a zero-length (or
        # clock-skewed) cycle cannot cause a division by zero below.
        cycle_time = time.time() - unit.last_cycle_time
        max_ops = max(1, int(unit.switching_frequency * cycle_time))
        unit.last_cycle_time = time.time()

        results = []
        cache_hits = 0
        total_ops = 0
        quantum_speedup = unit.switching_frequency / self.base_speed

        for chunk in chunks:
            ops_needed = float(np.prod(chunk.shape)) * quantum_speedup

            pieces = 1
            if ops_needed > max_ops:
                # Never ask for more pieces than there are rows to split.
                rows = chunk.shape[0] if chunk.ndim else 1
                pieces = max(1, min(int(np.ceil(ops_needed / max_ops)), rows))

            if pieces == 1:
                result = self._as_result_dict(unit.process_tensor(chunk, operation), operation)
                total_ops += ops_needed
            else:
                sub_results = []
                for sub_chunk in np.array_split(chunk, pieces):
                    sub_results.append(
                        self._as_result_dict(unit.process_tensor(sub_chunk, operation), operation)
                    )
                    total_ops += max_ops
                result = self._merge_sub_results(sub_results, operation)

            results.append(result)

            if result['metadata'].get('cache_hit', False):
                cache_hits += 1
            # Ops reported by the unit itself are added on top of the estimate.
            total_ops += result.get('operations', 0)

        aggregated = self._aggregate_unit_results(results, operation)

        processing_time = time.time() - start_time
        stats = self.unit_stats[unit_id]
        stats.update({
            'total_ops': stats['total_ops'] + total_ops,
            'cache_hits': stats['cache_hits'] + cache_hits,
            'processing_time': stats['processing_time'] + processing_time,
            'chunks_processed': stats.get('chunks_processed', 0) + len(chunks),
            'last_utilization': processing_time / max(0.001, time.time() - start_time)
        })

        return {
            'unit_id': unit_id,
            'results': results,
            'ops': total_ops,
            'cache_hits': cache_hits,
            'processing_time': processing_time,
            'result': aggregated['result'],
            'metadata': aggregated['metadata'],
        }

    @staticmethod
    def _as_result_dict(raw, operation: str) -> Dict:
        """Return ``raw`` in the ``{'result', 'metadata', 'operations'}`` shape used here.

        Dicts that already carry ``metadata`` pass through untouched.  Anything
        else (for example a bare ndarray) is wrapped.
        NOTE(review): wrapped results use the operation name as
        ``output_type`` -- confirm this matches what the units are meant to report.
        """
        if isinstance(raw, dict):
            if 'metadata' in raw:
                return raw
            return {**raw, 'metadata': {'output_type': operation}}
        return {
            'result': raw,
            'metadata': {'output_type': operation, 'cache_hit': False},
            'operations': 0,
        }

    def _merge_sub_results(self, sub_results: List[Dict], operation: str) -> Dict:
        """Recombine the results of the pieces of one split chunk into a single result dict."""
        payloads = [r['result'] for r in sub_results]
        if all(isinstance(p, np.ndarray) for p in payloads):
            # Inverse of np.array_split along axis 0.
            merged = np.concatenate(payloads)
            metadata = dict(sub_results[0]['metadata'])
        else:
            aggregated = self._aggregate_unit_results(sub_results, operation)
            merged = aggregated['result']
            metadata = dict(aggregated['metadata'])

        # The chunk only counts as a cache hit when every piece was one.
        metadata['cache_hit'] = all(r['metadata'].get('cache_hit', False) for r in sub_results)
        return {
            'result': merged,
            'metadata': metadata,
            'operations': sum(r.get('operations', 0) for r in sub_results),
        }

    def _aggregate_unit_results(self, results: List[Dict], operation: str) -> Dict:
        """Aggregate per-chunk result dicts according to the first result's ``output_type``.

        Returns:
            Dict with ``result`` and ``metadata``.  For empty input ``result``
            is ``None`` and the metadata carries an ``error`` message.
        """
        if not results:
            return {'result': None, 'metadata': {'error': 'No results to aggregate'}}

        # The first result decides how the whole batch is combined
        op_type = results[0]['metadata']['output_type']

        if op_type == 'attention_qkv':
            # Combine Q,K,V matrices
            combined = {
                part: np.concatenate([r['result'][part] for r in results], axis=1)
                for part in ('query', 'key', 'value')
            }
            return {
                'result': combined,
                'metadata': {
                    'output_type': 'attention_qkv',
                    'shapes': {k: v.shape for k, v in combined.items()}
                }
            }

        if op_type == 'attention_scores':
            scores = np.concatenate([r['result'] for r in results], axis=-1)
            return {
                'result': scores,
                'metadata': {
                    'output_type': 'attention_scores',
                    'shape': scores.shape
                }
            }

        if op_type == 'probability':
            # Combine and renormalize so the last axis sums to 1 again
            probs = np.concatenate([r['result'] for r in results], axis=-1)
            probs = probs / np.sum(probs, axis=-1, keepdims=True)
            return {
                'result': probs,
                'metadata': {
                    'output_type': 'probability',
                    'shape': probs.shape,
                    'sum': float(np.sum(probs))
                }
            }

        if op_type == 'transformer':
            # Combine transformer outputs, then re-normalize over the last axis
            combined = np.concatenate([r['result'] for r in results], axis=1)
            mean = np.mean(combined, axis=-1, keepdims=True)
            std = np.std(combined, axis=-1, keepdims=True)
            normalized = (combined - mean) / (std + 1e-6)
            return {
                'result': normalized,
                'metadata': {
                    'output_type': 'transformer',
                    'shape': normalized.shape,
                    'layer_norm_stats': {
                        'mean': float(np.mean(normalized)),
                        'std': float(np.std(normalized))
                    }
                }
            }

        # Default concatenation for other operations
        combined = np.concatenate([r['result'] for r in results], axis=0)
        return {
            'result': combined,
            'metadata': {
                'output_type': op_type,
                'shape': combined.shape
            }
        }

    def cleanup(self):
        """Reset every unit; a failure on one unit is logged and does not stop the others."""
        for unit in self.units:
            try:
                # Reset unit state and cache entries
                unit._reset_state()
            except Exception as e:
                logging.error(f"Error cleaning up unit {unit.unit_id} in core {self.core_id}: {str(e)}")

class CoreDistributor:
    """Manages hierarchical tensor distribution across cores and units.

    A tensor is sliced along axis 0, each slice is handed to one
    ``TensorCore`` and the per-core results are combined again.  Running
    totals and per-core figures are kept in ``distribution_stats``.
    """

    def __init__(self, num_cores: int = 8):
        """Create ``num_cores`` cores and empty distribution statistics."""
        self.num_cores = num_cores
        self.cores = [TensorCore(i) for i in range(num_cores)]
        self.total_ops = 0
        self.best_result = None
        self.start_time = None

        # Initialize distribution stats
        self.distribution_stats = {
            'total_chunks': 0,
            'chunks_per_core': {},
            'cache_hits': {},
            'processing_times': {}
        }

    async def distribute_tensor(self, tensor: np.ndarray, operation: str) -> Dict:
        """Process ``tensor`` on all cores concurrently and combine the results.

        Returns:
            Dict with ``result`` (see ``_combine_results``) and ``stats``
            (see ``get_distribution_stats``).

        Raises:
            Exception: Any error is logged and re-raised unchanged.
        """
        self.start_time = time.time()
        all_results = []

        try:
            chunks_per_core = self._calculate_core_distribution(tensor)

            # Keep the core ids next to their tasks: the mapping may skip
            # cores, so the position in the result list is not the core id.
            core_ids = list(chunks_per_core)
            core_tasks = [
                asyncio.create_task(
                    self.cores[core_id].process_parallel(chunks_per_core[core_id], operation)
                )
                for core_id in core_ids
            ]
            total_chunks = sum(len(chunks_per_core[core_id]) for core_id in core_ids)

            # gather() returns results in the order the tasks were passed in
            core_results = await asyncio.gather(*core_tasks)

            for core_id, result in zip(core_ids, core_results):
                self.total_ops += result['total_ops']
                self.distribution_stats['chunks_per_core'][core_id] = result['chunks_processed']
                self.distribution_stats['cache_hits'][core_id] = result.get('cache_hits', 0)
                self.distribution_stats['processing_times'][core_id] = result['processing_time']

                all_results.extend([unit['result'] for unit in result['unit_results']])

            self.distribution_stats['total_chunks'] = total_chunks

            final_result = self._combine_results(all_results, operation)

            return {
                'result': final_result,
                'stats': self.get_distribution_stats()
            }

        except Exception as e:
            logging.error(f"Error in tensor distribution: {str(e)}")
            raise

    def _calculate_core_distribution(self, tensor: np.ndarray) -> Dict[int, np.ndarray]:
        """Slice ``tensor`` along axis 0 into one piece per core.

        Cores with a row in ``core_assignments`` share the tensor in
        proportion to their free memory.  When no rows exist, or anything
        fails, the tensor is split evenly across all cores instead.

        Returns:
            Mapping of core id to that core's slice of ``tensor``.
        """
        try:
            core_stats = self._load_core_stats()
            if core_stats:
                return self._split_by_free_memory(tensor, core_stats)
            logging.warning("No core statistics available; using even distribution")
        except Exception as e:
            logging.error(f"Error calculating core distribution: {str(e)}")

        # Fall back to simple even distribution
        return {i: chunk for i, chunk in enumerate(np.array_split(tensor, len(self.cores)))}

    def _load_core_stats(self) -> Dict[int, Dict]:
        """Read chunk and memory figures for every core that has a ``core_assignments`` row."""
        core_stats = {}
        # One connection for all cores, closed explicitly: sqlite3's context
        # manager only commits or rolls back, it does not close.
        conn = sqlite3.connect(db_cache.types_db_path)
        try:
            cursor = conn.cursor()
            for core in self.cores:
                cursor.execute('''
                    SELECT active_chunks, total_memory, used_memory
                    FROM core_assignments
                    WHERE core_id = ?
                ''', (core.core_id,))
                row = cursor.fetchone()
                if row:
                    core_stats[core.core_id] = {
                        'active_chunks': row[0],
                        'total_memory': row[1],
                        'used_memory': row[2]
                    }
        finally:
            conn.close()
        return core_stats

    def _split_by_free_memory(self, tensor: np.ndarray, core_stats: Dict[int, Dict]) -> Dict[int, np.ndarray]:
        """Slice ``tensor`` across the cores in ``core_stats`` by their share of free memory.

        The last core listed takes every remaining row, so the slices always
        cover the whole tensor.  Cores whose slice would be empty are omitted.
        """
        # A core reporting more used than total memory has nothing free.
        free_memory = {
            core_id: max(0, stats['total_memory'] - stats['used_memory'])
            for core_id, stats in core_stats.items()
        }
        total_free = sum(free_memory.values())
        last_core = list(free_memory)[-1]

        distributions = {}
        offset = 0
        for core_id, available in free_memory.items():
            if core_id == last_core:
                core_chunks = tensor[offset:]
            else:
                core_share = available / total_free if total_free > 0 else 1 / len(self.cores)
                core_chunks = tensor[offset:offset + int(tensor.shape[0] * core_share)]

            if len(core_chunks) > 0:
                distributions[core_id] = core_chunks
                offset += len(core_chunks)

        return distributions

    def _combine_results(self, results: List[np.ndarray], operation: str) -> Optional[np.ndarray]:
        """Combine per-core results.

        Returns ``None`` for empty input, the last result for
        ``"text_generate"`` and the concatenation of all results otherwise.
        """
        if not results:
            return None

        if operation == "text_generate":
            return results[-1]  # Take last result for text generation
        return np.concatenate(results)

    def get_distribution_stats(self) -> Dict:
        """Return a snapshot of the distribution statistics.

        ``total_time`` is measured from the start of the latest
        ``distribute_tensor`` call (0 if none ran yet).  The per-second rates
        are only present when ``total_time`` is positive.
        """
        # Copy the nested per-core dicts too, so callers cannot mutate
        # this object's bookkeeping through the snapshot.
        stats = {
            key: dict(value) if isinstance(value, dict) else value
            for key, value in self.distribution_stats.items()
        }
        stats['total_time'] = time.time() - self.start_time if self.start_time else 0
        stats['total_ops'] = self.total_ops

        # Calculate efficiency metrics
        if stats['total_time'] > 0:
            stats['ops_per_second'] = self.total_ops / stats['total_time']
            stats['chunks_per_second'] = stats['total_chunks'] / stats['total_time']

        return stats

    def cleanup(self):
        """Clean up all cores"""
        for core in self.cores:
            core.cleanup()

# Root logger: INFO and above, written both to 'tensor_performance.log'
# and to the default stream handler.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('tensor_performance.log'), logging.StreamHandler()],
    level=logging.INFO,
)

# Module-wide DB cache instance shared by the classes in this file
db_cache = TensorDBCache()

class TensorUnit:
    """Individual tensor processing unit operating at electron speed with quantum acceleration"""
    
    # Shared DB connection per core
    _core_db_connections = {}  # Maps core_id to shared SQLite connection
    _core_db_locks = {}       # Thread safety for DB access
    
    def __init__(self, unit_id: int, core_id: int):
        self.unit_id = unit_id
        self.core_id = core_id
        
        # Set up shared DB connection for this core
        if core_id not in self._core_db_connections:
            db_path = f"tensor_core_{core_id}_cache.db"
            self._core_db_connections[core_id] = sqlite3.connect(db_path, check_same_thread=False)
            self._core_db_locks[core_id] = threading.Lock()
            
            # Initialize core-level tables if needed
            with self._core_db_locks[core_id]:
                cursor = self._core_db_connections[core_id].cursor()
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS unit_cache (
                        unit_id INTEGER,
                        chunk_id TEXT,
                        data BLOB,
                        electron_cycles INTEGER,
                        quantum_state REAL,
                        last_access REAL,
                        PRIMARY KEY (unit_id, chunk_id)
                    )
                """)
                self._core_db_connections[core_id].commit()
        
        # Performance tracking
        self.total_ops = 0
        self.cache_hits = 0
        self.processing_time = 0
        self.chunks_processed = 0
        self.last_utilization = 0
        self.recent_performance = 1.0  # Performance score for load balancing
        
        # Advanced electron physics parameters
        self.base_speed = 8.92e85  # Base computational speed
        self.electron_drift_velocity = 1.96e7 * (unit_id + 1)  # Scaled with unit ID
        self.switching_frequency = self.base_speed + (unit_id * self.base_speed)  # Scaled frequency
        self.path_length = 14e-9  # 14nm process node
        self.last_cycle_time = time.time()
        
        # Quantum acceleration parameters
        self.ops_per_cycle = int(self.switching_frequency)  # Maximum operations per cycle
        self.quantum_speedup = self.switching_frequency / self.base_speed
        
        # Operation timing
        self.cycle_time = self.path_length / self.electron_drift_velocity
        self.max_throughput = self.switching_frequency * self.quantum_speedup
        
        # Operation scaling
        ops_per_second = 9.98e15  # Based on electron transit
        self.ops_per_cycle = int(ops_per_second / 1000)
        self.last_cycle_time = time.time()
        
        # Performance metrics
        self.best_result = None
        self.last_batch_size = 0
        self.peak_ops_per_second = 0
        self.avg_cache_hit_rate = 0.0
        
        # Enhanced cache configuration with shared core storage
        self.cache_prefix = f"core{core_id}_unit{unit_id}"
        self.cache_stats = {
            'hits': 0,
            'misses': 0,
            'stored_chunks': 0,
            'evicted_chunks': 0,
            'last_hit_time': 0,
            'quantum_speedups': 0,
            'electron_accelerations': 0
        }
        
        # Ensure unit exists in shared core DB
        self._verify_unit_in_core_db()
        
    def _verify_unit_in_core_db(self):
        """Verify unit exists in shared core DB cache table"""
        with self._core_db_locks[self.core_id]:
            cursor = self._core_db_connections[self.core_id].cursor()
            
            # Add unit performance tracking if not exists
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS unit_performance (
                    unit_id INTEGER PRIMARY KEY,
                    total_ops INTEGER,
                    cache_hits INTEGER,
                    quantum_boost REAL,
                    last_update REAL
                )
            """)
            
            # Initialize or update unit performance entry
            cursor.execute("""
                INSERT OR IGNORE INTO unit_performance 
                (unit_id, total_ops, cache_hits, quantum_boost, last_update)
                VALUES (?, 0, 0, 1.0, ?)
            """, (self.unit_id, time.time()))
            
            self._core_db_connections[self.core_id].commit()
        
    def _generate_chunk_id(self, tensor: np.ndarray, operation: str) -> str:
        """Generate a unique ID for a tensor chunk with quantum signature"""
        # Include quantum state in the hash
        tensor_bytes = tensor.tobytes()
        tensor_shape = str(tensor.shape)
        quantum_state = f"{self.quantum_speedup}_{self.switching_frequency}"
        metadata = f"{self.cache_prefix}_{operation}_{tensor_shape}_{quantum_state}"
        
        return hashlib.sha256(tensor_bytes + metadata.encode()).hexdigest()
        
    def process_tensor(self, tensor: np.ndarray, operation: str) -> np.ndarray:
        """Process tensor at electron speed with quantum-accelerated shared caching"""
        process_start = time.time()
        
        # Check shared core cache first
        chunk_id = self._generate_chunk_id(tensor, operation)
        
        with self._core_db_locks[self.core_id]:
            cursor = self._core_db_connections[self.core_id].cursor()
            
            # Try to get from cache with quantum state
            cursor.execute("""
                SELECT data, electron_cycles, quantum_state, last_access
                FROM unit_cache 
                WHERE unit_id = ? AND chunk_id = ?
            """, (self.unit_id, chunk_id))
            
            cached = cursor.fetchone()
            if cached:
                data, cycles, q_state, last_access = cached
                
                # Apply quantum acceleration from cache hit
                tensor_data = np.frombuffer(data, dtype=tensor.dtype).reshape(tensor.shape)
                boost = q_state * (cycles / 100 + 1)  # Quantum boost grows with usage
                
                # Update quantum state and access time
                cursor.execute("""
                    UPDATE unit_cache
                    SET electron_cycles = electron_cycles + 1,
                        quantum_state = quantum_state * ?,
                        last_access = ?
                    WHERE unit_id = ? AND chunk_id = ?
                """, (boost, time.time(), self.unit_id, chunk_id))
                
                # Update unit performance
                cursor.execute("""
                    UPDATE unit_performance
                    SET cache_hits = cache_hits + 1,
                        quantum_boost = quantum_boost * ?,
                        last_update = ?
                    WHERE unit_id = ?
                """, (boost, time.time(), self.unit_id))
                
                self._core_db_connections[self.core_id].commit()
                return tensor_data * boost  # Apply quantum acceleration
                
        # Cache miss - process at electron speed
        ops_needed = tensor.size * self.ops_per_cycle
        current_time = time.time()
        time_delta = current_time - self.last_cycle_time
        max_ops = int(self.switching_frequency * time_delta)
        
        # Process tensor with electron-speed acceleration
        if ops_needed <= max_ops:
            # Process at full electron speed
            result = self._process_at_electron_speed(tensor, operation)
        else:
            # Split for optimal electron throughput
            splits = int(np.ceil(ops_needed / max_ops))
            sub_tensors = np.array_split(tensor, splits)
            partial_results = []
            
            for sub_tensor in sub_tensors:
                partial = self._process_at_electron_speed(sub_tensor, operation)
                partial_results.append(partial)
                
            result = np.concatenate(partial_results)
            
        # Cache the result with initial quantum state
        with self._core_db_locks[self.core_id]:
            cursor = self._core_db_connections[self.core_id].cursor()
            cursor.execute("""
                INSERT INTO unit_cache 
                (unit_id, chunk_id, data, electron_cycles, quantum_state, last_access)
                VALUES (?, ?, ?, 1, ?, ?)
            """, (self.unit_id, chunk_id, result.tobytes(), 
                 self.quantum_speedup, time.time()))
            
            # Update unit stats
            cursor.execute("""
                UPDATE unit_performance
                SET total_ops = total_ops + ?,
                    last_update = ?
                WHERE unit_id = ?
            """, (ops_needed, time.time(), self.unit_id))
            
            self._core_db_connections[self.core_id].commit()
            
        self.last_cycle_time = current_time
        return result
        
    def _process_at_electron_speed(self, tensor: np.ndarray, operation: str) -> np.ndarray:
        """Process tensor at maximum electron speed with quantum effects"""
        # Calculate electron-speed throughput
        electron_traversals = int(self.switching_frequency * (tensor.size / self.path_length))
        quantum_ops = electron_traversals * self.quantum_speedup
        
        # Get current unit quantum boost from DB
        with self._core_db_locks[self.core_id]:
            cursor = self._core_db_connections[self.core_id].cursor()
            cursor.execute("""
                SELECT quantum_boost
                FROM unit_performance
                WHERE unit_id = ?
            """, (self.unit_id,))
            
            current_boost = cursor.fetchone()[0]
        
        # Apply operation with quantum-electron acceleration
        if operation == 'matmul':
            result = np.matmul(tensor, tensor.T) * current_boost
        elif operation == 'conv':
            result = np.convolve(tensor.flatten(), tensor.flatten(), 'same').reshape(tensor.shape) * current_boost
        else:
            # Default to element-wise operations
            result = tensor * current_boost
            
        return result
        self.chunks_processed += 1
        self.last_batch_size = tensor.shape[0] if len(tensor.shape) > 1 else 1
        
        try:
            # Generate quantum-aware chunk ID
            chunk_id = self._generate_chunk_id(tensor, operation)
            
            # Check persistent cache with quantum speedup
            import sqlite3
            conn = sqlite3.connect(self.cache_db_path)
            c = conn.cursor()
            
            # Query with electron-speed indexing
            c.execute("""
                SELECT tensor_data, electron_cycles, quantum_speedup 
                FROM tensor_cache 
                WHERE chunk_id = ? AND operation = ?
            """, (chunk_id, operation))
            
            cached = c.fetchone()
            
            if cached:
                # Quantum-accelerated cache hit
                self.cache_hits += 1
                self.cache_stats['hits'] += 1
                self.cache_stats['quantum_speedups'] += 1
                self.cache_stats['last_hit_time'] = time.time()
                
                # Apply quantum speedup
                tensor_data = np.frombuffer(cached[0], dtype=tensor.dtype).reshape(tensor.shape)
                electron_cycles = cached[1] + 1
                quantum_boost = cached[2] * self.quantum_speedup
                
                # Update cache with enhanced quantum state
                c.execute("""
                    UPDATE tensor_cache 
                    SET electron_cycles = ?, quantum_speedup = ?, last_access = ? 
                    WHERE chunk_id = ?
                """, (electron_cycles, quantum_boost, time.time(), chunk_id))
                
                conn.commit()
                conn.close()
                
                # Calculate accelerated processing time
                processing_time = self.cycle_time / (quantum_boost * electron_cycles)
                self._update_performance_metrics(electron_cycles, processing_time, True)
                
                return tensor_data
                
            # Cache miss - process at electron speed
            self.cache_stats['misses'] += 1
            
            # Calculate operations at current frequency
            cycle_time = time.time() - self.last_cycle_time
            max_ops = int(self.switching_frequency * cycle_time)
            
            # Process tensor with electron-speed acceleration
            ops_needed = np.prod(tensor.shape) * self.quantum_speedup
            if ops_needed <= max_ops:
                # Process at full electron speed
                result = self._process_at_electron_speed(tensor, operation)
            else:
                # Split for optimal electron throughput
                splits = int(np.ceil(ops_needed / max_ops))
                sub_tensors = np.array_split(tensor, splits)
                results = []
                for sub_tensor in sub_tensors:
                    sub_result = self._process_at_electron_speed(sub_tensor, operation)
                    results.append(sub_result)
                result = np.concatenate(results)
                
            # Store in persistent cache with quantum signature
            tensor_bytes = result.tobytes()
            c.execute("""
                INSERT OR REPLACE INTO tensor_cache
                (chunk_id, tensor_data, operation, electron_cycles, quantum_speedup, last_access)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (chunk_id, tensor_bytes, operation, 1, self.quantum_speedup, time.time()))
            
            conn.commit()
            conn.close()
            
            # Update metrics
            processing_time = time.time() - process_start
            self._update_performance_metrics(1, processing_time, False)
            
            return result
                
            # Cache miss - perform electron-speed processing
            self.cache_stats['misses'] += 1
            electron_transits = self._calculate_electron_transits()
            operations_this_cycle = self._calculate_operations(electron_transits, tensor.size)
            
            # Process tensor based on operation type
            result = self._process_operation(tensor, operation, operations_this_cycle)
            
            # Cache the result
            db_cache.store_tensor_chunk(chunk_id, result, {
                'unit_id': self.unit_id,
                'core_id': self.core_id,
                'operation': operation,
                'shape': tensor.shape
            })
            self.cache_stats['stored_chunks'] += 1
            
            # Update performance metrics
            processing_time = time.time() - process_start
            self._update_performance_metrics(operations_this_cycle, processing_time, False)
            
            # Update core metrics
            db_cache.update_core_metrics(self.core_id, {
                'cache_misses': 1,
                'processing_time': processing_time,
                'ops_performed': operations_this_cycle
            })
            
            return result
            
        except Exception as e:
            logging.error(f"Error in unit {self.unit_id} processing: {str(e)}")
            raise
            
    def _calculate_electron_transits(self) -> float:
        """Calculate electron transits for current cycle"""
        time_delta = time.time() - self.last_cycle_time
        self.last_cycle_time = time.time()
        
        # Calculate based on electron physics
        max_transits = self.electron_drift_velocity * self.switching_frequency
        actual_transits = max_transits * time_delta
        return min(actual_transits, 98.92e555)  # Cap at theoretical maximum
        
    def _calculate_operations(self, electron_transits: float, tensor_size: int) -> int:
        """Calculate number of operations for current cycle"""
        base_ops = int(min(electron_transits, tensor_size * self.ops_per_cycle))
        quantum_factor = self.switching_frequency / self.traverse_time
        return int(base_ops * quantum_factor)
        
    def _update_performance_metrics(self, ops: int, processing_time: float, cache_hit: bool):
        """Update unit performance metrics"""
        self.total_ops += ops
        self.processing_time += processing_time
        
        # Update cache statistics
        if self.chunks_processed > 0:
            self.avg_cache_hit_rate = self.cache_hits / self.chunks_processed
            
        # Calculate ops per second
        if processing_time > 0:
            ops_per_second = ops / processing_time
            self.peak_ops_per_second = max(self.peak_ops_per_second, ops_per_second)
            
        # Update utilization
        total_time = time.time() - self.last_cycle_time
        self.last_utilization = processing_time / max(total_time, 1e-6)
        
        # Calculate performance score for load balancing
        processing_efficiency = min(1.0, ops_per_second / (self.peak_ops_per_second + 1e-6))
        cache_efficiency = self.avg_cache_hit_rate
        utilization_factor = 1 - self.last_utilization
        
        self.recent_performance = (
            0.4 * processing_efficiency +
            0.4 * cache_efficiency +
            0.2 * utilization_factor
        )
        
    def _process_operation(self, tensor: np.ndarray, operation: str, operations_this_cycle: int) -> Dict:
        """Process tensor with specified operation and return result with metadata"""
        try:
            # Decode operation type and parameters
            op_parts = operation.split('_')
            base_op = op_parts[0]
            
            if base_op == "matmul":
                result = self._electron_speed_matmul(tensor)
                metadata = {'output_type': 'matrix', 'shape': result.shape}
                
            elif base_op == "attention":
                # Handle attention mechanism
                if len(op_parts) > 1 and op_parts[1] == "qkv":
                    # Query, Key, Value computation
                    q, k, v = self._compute_qkv_attention(tensor)
                    result = {'query': q, 'key': k, 'value': v}
                    metadata = {
                        'output_type': 'attention_qkv',
                        'shapes': {
                            'query': q.shape,
                            'key': k.shape,
                            'value': v.shape
                        }
                    }
                else:
                    # Attention score computation
                    result = self._compute_attention_scores(tensor)
                    metadata = {'output_type': 'attention_scores', 'shape': result.shape}
                    
            elif base_op == "softmax":
                # Handle probability distributions
                result = self._compute_softmax(tensor)
                metadata = {
                    'output_type': 'probability',
                    'shape': result.shape,
                    'sum': float(np.sum(result))  # Verify normalization
                }
                
            elif base_op == "transformer":
                # Handle transformer layer operations
                result = self._process_transformer_block(tensor)
                metadata = {
                    'output_type': 'transformer',
                    'shape': result.shape,
                    'layer_norm_stats': {
                        'mean': float(np.mean(result)),
                        'std': float(np.std(result))
                    }
                }
                
            else:
                # Default operations
                if base_op == "vision_encode":
                    result = self._electron_speed_vision_encode(tensor)
                elif base_op == "text_generate":
                    result = self._electron_speed_text_generate(tensor)
                elif base_op == "conv":
                    result = self._electron_speed_conv(tensor)
                else:
                    result = self._electron_speed_elementwise(tensor, base_op)
                metadata = {'output_type': base_op, 'shape': result.shape}
            
            # Apply quantum acceleration
            quantum_factor = operations_this_cycle / max(tensor.size, 1)
            if isinstance(result, dict):
                result = {k: v * quantum_factor for k, v in result.items()}
            else:
                result = result * quantum_factor
                
            return {
                'result': result,
                'metadata': metadata,
                'quantum_factor': quantum_factor,
                'operations': operations_this_cycle
            }
            
        except Exception as e:
            logging.error(f"Error in operation {operation} on unit {self.unit_id}: {str(e)}")
            raise
            
    def _reset_state(self):
        """Reset unit state for cleanup"""
        self.total_ops = 0
        self.cache_hits = 0
        self.processing_time = 0
        self.chunks_processed = 0
        self.last_utilization = 0
        self.recent_performance = 1.0
        self.cache_stats = {
            'hits': 0,
            'misses': 0,
            'stored_chunks': 0,
            'evicted_chunks': 0,
            'last_hit_time': 0
        }
        self.best_result = None

        self.last_cycle_time = time.time()
        
       
        
    def _electron_speed_matmul(self, tensor: np.ndarray) -> np.ndarray:
        """Matrix multiplication accelerated by electron physics"""
        # Calculate quantum-corrected processing units
        quantum_units = int(self.switching_frequency * self.electron_drift_velocity)
        
        # Process at electron speed
        if len(tensor.shape) == 2:
            return np.dot(tensor, tensor.T) * quantum_units
        else:
            # Handle batched operations
            return np.matmul(tensor, np.transpose(tensor, (0, 2, 1))) * quantum_units
            
    def _electron_speed_vision_encode(self, tensor: np.ndarray) -> np.ndarray:
        """Vision feature extraction at electron speed"""
        quantum_units = int(self.switching_frequency * self.electron_drift_velocity)
        
        # Apply vision processing operations
        # 1. Initial convolution layers
        x = np.zeros((tensor.shape[0], tensor.shape[1], tensor.shape[2], 64), dtype=np.float32)
        
        # Create an array of sobel-like kernels for edge detection and feature extraction
        kernels = np.array([
            # Vertical edge detection
            [[-1, 0, 1],
             [-2, 0, 2],
             [-1, 0, 1]],
            # Horizontal edge detection 
            [[-1, -2, -1],
             [0, 0, 0],
             [1, 2, 1]],
            # 45 degree diagonal
            [[2, 1, 0],
             [1, 0, -1],
             [0, -1, -2]]
        ], dtype=np.float32)
        
        # Apply convolutions
        for k in range(3):  # Apply each kernel
            kernel = kernels[k]
            pad = 1
            padded = np.pad(tensor, ((0,0), (pad,pad), (pad,pad), (0,0)), mode='reflect')
            
            for i in range(tensor.shape[1]):
                for j in range(tensor.shape[2]):
                    window = padded[:, i:i+3, j:j+3, :]  # Get 3x3 window for each pixel
                    # Broadcast kernel for batch and channel dimensions
                    kernel_broadcast = np.tile(kernel[:, :, np.newaxis], (1, 1, window.shape[3]))
                    kernel_broadcast = np.expand_dims(kernel_broadcast, 0)  # Add batch dimension
                    kernel_broadcast = np.tile(kernel_broadcast, (window.shape[0], 1, 1, 1))  # Repeat for each batch item
                    x[:, i, j, k::3] = np.sum(window * kernel_broadcast, axis=(1,2))
        
        # 2. Feature extraction - add non-linearity
        x = np.maximum(x, 0)  # ReLU activation
        
        # 3. Dimensionality reduction - average pooling
        pool_size = 2
        output_h = x.shape[1] // pool_size
        output_w = x.shape[2] // pool_size
        pooled = np.zeros((x.shape[0], output_h, output_w, x.shape[3]))
        
        for i in range(output_h):
            for j in range(output_w):
                pooled[:, i, j, :] = np.mean(x[:, i*2:(i+1)*2, j*2:(j+1)*2, :], axis=(1,2))
                
        # 4. Flatten to feature vector
        features = pooled.reshape(pooled.shape[0], -1)
        
        # 5. Feature normalization
        features = features / (np.linalg.norm(features, axis=1, keepdims=True) + 1e-6)
        
        return features * quantum_units

    def _electron_speed_text_generate(self, tensor: np.ndarray) -> np.ndarray:
        """Text generation at electron speed"""
        quantum_units = int(self.switching_frequency * self.electron_drift_velocity)
        
        # Get vocab size from config
        vocab_size = 50257  # Standard GPT vocab size
        
        # 1. Project features to vocab space using a learned projection matrix
        projection_matrix = np.random.randn(tensor.shape[1], vocab_size) / np.sqrt(tensor.shape[1])
        logits = np.dot(tensor, projection_matrix)  # Project to vocab size
        
        # 2. Apply temperature scaling for controlled randomness
        temperature = 0.7
        logits = logits / temperature
        
        # 3. Softmax for token probabilities
        # Numerically stable softmax
        max_logits = np.max(logits, axis=-1, keepdims=True)
        exp_logits = np.exp(logits - max_logits)
        probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
        
        # 4. Sample tokens using multinomial sampling
        tokens = np.zeros((tensor.shape[0], 50), dtype=np.int32)  # Generate 50 tokens per image
        
        for i in range(tokens.shape[1]):
            # Sample next token for each sequence
            for b in range(tensor.shape[0]):
                next_token = np.random.choice(vocab_size, p=probs[b])
                tokens[b, i] = next_token
                
            if i < tokens.shape[1] - 1:
                # Project tokens back to feature space and generate next logits
                token_features = projection_matrix[:, tokens[:, i]]
                logits = np.dot(token_features.T, projection_matrix)
                logits = logits / temperature
                
                # Update probabilities
                max_logits = np.max(logits, axis=-1, keepdims=True)
                exp_logits = np.exp(logits - max_logits)
                probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
        
        return tokens * quantum_units

    def _electron_speed_conv(self, tensor: np.ndarray) -> np.ndarray:
        """Convolution accelerated by electron physics"""
        quantum_units = int(self.switching_frequency * self.electron_drift_velocity)
        
        # Create multiple feature extraction kernels
        kernels = np.array([
            # Edge detection kernels
            [[-1, -1, -1],
             [ 0,  0,  0],
             [ 1,  1,  1]],
            [[-1,  0,  1],
             [-1,  0,  1],
             [-1,  0,  1]],
            # Blob detection kernels
            [[ 1,  1,  1],
             [ 1, -8,  1],
             [ 1,  1,  1]],
            [[-1, -1, -1],
             [-1,  8, -1],
             [-1, -1, -1]],
            # Texture kernels
            [[ 1, -2,  1],
             [-2,  4, -2],
             [ 1, -2,  1]],
            [[-1,  2, -1],
             [ 2, -4,  2],
             [-1,  2, -1]]
        ], dtype=np.float32)
        
        num_kernels = len(kernels)
        batch_size, height, width, channels = tensor.shape
        
        # Output will have same spatial dimensions but more feature channels
        output = np.zeros((batch_size, height, width, channels * num_kernels), dtype=np.float32)
        
        # Pad input for convolution
        pad = 1  # For 3x3 kernels
        padded = np.pad(tensor, ((0,0), (pad,pad), (pad,pad), (0,0)), mode='reflect')
        
        # Apply each kernel
        for k, kernel in enumerate(kernels):
            kernel = kernel[:, :, np.newaxis, np.newaxis]  # Shape for broadcasting
            for i in range(height):
                for j in range(width):
                    window = padded[:, i:i+3, j:j+3, :]  # Get 3x3 window for each position
                    kernel_broadcast = np.tile(kernel[:, :, np.newaxis], (1, 1, window.shape[3]))
                    # Expand kernel for batch dimension
                    kernel_broadcast = np.expand_dims(kernel_broadcast, 0)  # Add batch dimension
                    kernel_broadcast = np.tile(kernel_broadcast, (window.shape[0], 1, 1, 1))  # Repeat for each batch item
                    output[:, i, j, k::num_kernels] = np.sum(window * kernel_broadcast, axis=(1, 2))
                    
        return output * quantum_units
        
    def _electron_speed_elementwise(self, tensor: np.ndarray, operation: str) -> np.ndarray:
        """Element-wise operations at electron speed"""
        quantum_units = int(self.switching_frequency * self.electron_drift_velocity)
        
        if operation == "relu":
            return np.maximum(tensor, 0) * quantum_units
        elif operation == "sigmoid":
            return 1 / (1 + np.exp(-tensor * quantum_units))
        else:
            return tensor * quantum_units

class TensorCore:
    """Manages multiple tensor units for parallel processing with dynamic chunk distribution.

    A core owns ``num_units`` ``TensorUnit`` instances, registers itself in the
    ``core_assignments`` table of the DB cache on construction, and splits
    incoming tensors into chunks that are assigned to its units round-robin.
    """

    def __init__(self, core_id: int, num_units: int = 3000):
        """Create the core's units and register the core in the DB cache.

        Args:
            core_id: Identifier of this core; key of its ``core_assignments`` row.
            num_units: Number of ``TensorUnit`` instances to create.
        """
        self.core_id = core_id
        self.num_units = num_units
        self.units = [TensorUnit(i, core_id) for i in range(num_units)]
        self.storage = LocalStorageManager()

        # Operation counter accumulated by process_parallel(); it has to exist
        # before the first ``+=`` there.
        self.total_ops = 0

        # Initialize quantum-based chunk size
        self.min_chunk_size = 1024  # Base chunk size
        self.quantum_chunk_mult = 128  # Quantum multiplier for chunk sizing

        # Initialize core in DB cache
        self._init_core_in_cache()

    @staticmethod
    def _run_cache_statement(sql: str, params: tuple = (), fetch: bool = False):
        """Run one statement against the core-assignment DB and close the connection.

        ``with conn`` only commits or rolls back the transaction; it does not
        close the connection, hence the explicit ``finally``.

        Returns:
            The first result row when ``fetch`` is true, otherwise ``None``.
        """
        conn = sqlite3.connect(db_cache.types_db_path)
        try:
            with conn:
                cursor = conn.cursor()
                cursor.execute(sql, params)
                return cursor.fetchone() if fetch else None
        finally:
            conn.close()

    def _init_core_in_cache(self):
        """Initialize core entry in the DB cache.

        Writes (or replaces) this core's ``core_assignments`` row with zero
        active chunks, zero used memory and a total memory of
        ``num_units * quantum_chunk_mult * min_chunk_size``.

        Raises:
            Exception: Any failure is logged and re-raised.
        """
        try:
            # Calculate total memory available to this core
            total_memory = self.num_units * self.quantum_chunk_mult * self.min_chunk_size

            # Initialize core assignment entry
            self._run_cache_statement('''
                    INSERT OR REPLACE INTO core_assignments 
                    (core_id, active_chunks, total_memory, used_memory, last_updated)
                    VALUES (?, 0, ?, 0, CURRENT_TIMESTAMP)
                ''', (self.core_id, float(total_memory)))

        except Exception as e:
            logging.error(f"Failed to initialize core {self.core_id} in cache: {str(e)}")
            raise

    def calculate_optimal_chunks(self, tensor: np.ndarray) -> int:
        """Calculate optimal number of chunks based on tensor characteristics and cache availability.

        The result is the minimum of four limits: tensor bytes per chunk size,
        80% of the core's free cache memory per chunk size, a limit derived
        from the units' combined ``electron_drift_velocity * switching_frequency``,
        and ten chunks per unit. Each limit is at least 1.

        If anything in that calculation fails (including a missing
        ``core_assignments`` row), the error is logged and a size-only estimate
        capped at the number of units is returned instead.

        Args:
            tensor: Array to be split.

        Returns:
            Number of chunks to split ``tensor`` into (>= 1).
        """
        try:
            # Get current core cache stats
            cache_stats = self._run_cache_statement('''
                    SELECT active_chunks, total_memory, used_memory 
                    FROM core_assignments 
                    WHERE core_id = ?
                ''', (self.core_id,), fetch=True)

            if not cache_stats:
                raise Exception(f"No cache stats found for core {self.core_id}")

            _active_chunks, total_memory, used_memory = cache_stats

            # Calculate based on tensor size and available cache
            tensor_size = tensor.nbytes
            available_memory = total_memory - used_memory

            # Calculate quantum-adjusted chunk size
            quantum_chunk_size = self.min_chunk_size * self.quantum_chunk_mult

            # Initial chunk count based on tensor size
            chunks_by_size = max(tensor_size // quantum_chunk_size, 1)

            # Adjust based on available cache
            chunks_by_cache = max(
                int((available_memory * 0.8) // quantum_chunk_size),  # Leave 20% buffer
                1
            )

            # Factor in electron physics
            total_electron_capacity = sum(unit.electron_drift_velocity * unit.switching_frequency
                                          for unit in self.units)
            chunks_by_electron = tensor_size // (total_electron_capacity * quantum_chunk_size)

            # Take minimum of all factors to ensure optimal distribution
            optimal_chunks = min(
                chunks_by_size,
                chunks_by_cache,
                max(1, int(chunks_by_electron)),
                len(self.units) * 10  # Max 10 chunks per unit
            )

            return optimal_chunks

        except Exception as e:
            logging.error(f"Error calculating optimal chunks for core {self.core_id}: {str(e)}")
            # Fall back to basic calculation
            return min(len(self.units), max(1, tensor.nbytes // (self.min_chunk_size * self.quantum_chunk_mult)))

    async def process_parallel(self, tensor: np.ndarray, operation: str) -> Dict:
        """Process tensor chunks across units with cache-aware distribution.

        The tensor is split into ``calculate_optimal_chunks()`` pieces, the
        core's ``core_assignments`` row is updated, chunks are assigned to
        units round-robin and each unit processes its chunks in turn. The
        body contains no ``await``, so units run one after another.

        Args:
            tensor: Array to process.
            operation: Operation name passed to each unit's ``process_tensor``.

        Returns:
            Dict with ``core_id``, ``total_ops``, ``chunks_processed`` and
            ``unit_results`` (one dict per unit that received chunks, holding
            ``unit_id``, ``ops``, ``processing_time``, ``chunks_processed``,
            ``cache_hits`` and the combined ``result``).

        Raises:
            Exception: Errors while distributing chunks are logged and
                re-raised; errors from units propagate unchanged.
        """
        # Calculate optimal chunk count using cache-aware method
        num_chunks = self.calculate_optimal_chunks(tensor)
        chunks = np.array_split(tensor, num_chunks)

        logging.info(f"Core {self.core_id}: Split into {len(chunks)} chunks")

        # Update core cache stats
        self._run_cache_statement('''
                UPDATE core_assignments 
                SET active_chunks = active_chunks + ?, 
                    used_memory = used_memory + ?, 
                    last_updated = CURRENT_TIMESTAMP
                WHERE core_id = ?
            ''', (len(chunks), float(tensor.nbytes), self.core_id))

        try:
            # Distribute chunks to units round-robin
            unit_assignments = {}
            for i, chunk in enumerate(chunks):
                unit_assignments.setdefault(i % len(self.units), []).append(chunk)
        except Exception as e:
            logging.error(f"Error distributing chunks in core {self.core_id}: {str(e)}")
            raise

        # Process chunks across units
        results = []
        total_cache_hits = 0
        total_processing_time = 0

        for unit_idx, unit_chunks in unit_assignments.items():
            unit = self.units[unit_idx]
            unit_start = time.time()

            # Process all chunks assigned to this unit
            unit_results = []
            unit_cache_hits = 0

            for chunk in unit_chunks:
                # Probe the cache before processing: a lookup made afterwards
                # could also match an entry written by the processing call
                # itself and would then report every chunk as a hit.
                chunk_id = unit._generate_chunk_id(chunk, operation)
                was_cached = db_cache.chunk_exists(chunk_id)

                # unit.total_ops is a running total, so only the increase
                # caused by this chunk is added to the core's counter.
                ops_before = unit.total_ops
                unit_results.append(unit.process_tensor(chunk, operation))
                self.total_ops += unit.total_ops - ops_before

                if was_cached:
                    unit_cache_hits += 1

            # Combine unit's results
            combined_result = np.concatenate(unit_results) if operation != "text_generate" else unit_results[-1]
            processing_time = time.time() - unit_start

            total_cache_hits += unit_cache_hits
            total_processing_time += processing_time

            results.append({
                'unit_id': unit.unit_id,
                'ops': unit.total_ops,
                'processing_time': processing_time,
                'chunks_processed': len(unit_chunks),
                'cache_hits': unit_cache_hits,
                'result': combined_result
            })

        # Update cache metrics
        db_cache.update_core_metrics(self.core_id, {
            'cache_hits': total_cache_hits,
            'total_chunks': len(chunks),
            'processing_time': total_processing_time
        })

        return {
            'core_id': self.core_id,
            'total_ops': self.total_ops,
            'chunks_processed': len(chunks),
            'unit_results': results
        }

    def _process_at_electron_speed(self, tensor: np.ndarray, operation: str) -> np.ndarray:
        """Process tensor at full electron speed with quantum acceleration.

        ``"encode"`` multiplies by ``quantum_speedup``, ``"decode"`` divides by
        it, ``"process"`` multiplies by
        ``exp(quantum_speedup / electron_drift_velocity)``, and any other
        operation multiplies by ``sqrt(switching_frequency / base_speed)``.
        Once ``cache_stats['hits']`` exceeds 100 the result is additionally
        multiplied in place by ``log10(hits)``.

        NOTE(review): this method reads ``switching_frequency``,
        ``quantum_speedup``, ``path_length``, ``electron_drift_velocity``,
        ``base_speed`` and ``cache_stats`` from ``self``, none of which
        ``TensorCore.__init__`` sets - it looks like it was written for the
        unit class; confirm where it is meant to live.
        """
        # Calculate quantum-accelerated operations (values currently unused)
        ops_per_cycle = int(self.switching_frequency * self.quantum_speedup)
        electron_traversals = int(ops_per_cycle * (tensor.size / self.path_length))

        # Apply electron-speed operations
        if operation == "encode":
            # Accelerated encoding operations
            result = tensor * self.quantum_speedup
        elif operation == "decode":
            # Accelerated decoding operations
            result = tensor / self.quantum_speedup
        elif operation == "process":
            # Accelerated tensor processing with quantum boost
            result = tensor * np.exp(self.quantum_speedup / self.electron_drift_velocity)
        else:
            # Apply general quantum acceleration
            scale = np.sqrt(self.switching_frequency / self.base_speed)
            result = tensor * scale

        # Track electron cycles
        self.cache_stats['electron_accelerations'] += 1

        # Apply additional quantum acceleration if multiple hits
        if self.cache_stats['hits'] > 100:
            quantum_factor = np.log10(self.cache_stats['hits'])
            result *= quantum_factor

        return result

class ParallelTensorProcessor:
    """Front door for tensor runs: delegates to a ``CoreDistributor`` and keeps run bookkeeping.

    Instance state:
        distributor -- the ``CoreDistributor`` built for ``num_cores``
        start_time  -- ``time.time()`` value taken when the latest run began
        processing  -- True only while ``process_tensor`` is executing
        total_ops   -- ``stats['total_ops']`` of the latest successful run
        best_result -- ``'result'`` entry of the latest successful run
    """

    def __init__(self, num_cores: int = 8):
        self.distributor = CoreDistributor(num_cores)
        # Bookkeeping starts empty until the first run
        self.start_time = None
        self.processing = False
        self.total_ops = 0
        self.best_result = None

    async def process_tensor(self, tensor: np.ndarray, operation: str = "matmul") -> Dict:
        """Send ``tensor`` through the distributor and return the distributor's result dict.

        The returned dict is read for its ``'result'`` and ``'stats'`` entries,
        which are stored on the instance and logged. Any exception is logged
        and re-raised; ``processing`` is reset to False in every case.
        """
        self.processing = True
        self.start_time = time.time()

        try:
            outcome = await self.distributor.distribute_tensor(tensor, operation)

            # Keep the payload and the operation count of this run
            self.best_result = outcome['result']
            run_stats = outcome['stats']
            self.total_ops = run_stats['total_ops']

            # Report how long the run took together with its stats
            elapsed = time.time() - self.start_time
            self.log_final_results(elapsed, run_stats)
        except Exception as exc:
            logging.error(f"Error in tensor processing: {exc!s}")
            raise
        else:
            return outcome
        finally:
            self.processing = False

    def log_final_results(self, duration: float, stats: Dict):
        """Write a run summary, per-core figures and cache totals to the root logger.

        ``stats`` is read for the keys ``total_ops``, ``ops_per_second``,
        ``total_chunks``, ``chunks_per_core``, ``cache_hits`` and
        ``processing_times``; the last two are looked up per core id with a
        default of 0.
        """
        emit = logging.info

        # Run-wide summary
        emit("\nTensor processing completed:")
        emit(f"Duration: {duration:.2f} seconds")
        emit(f"Total operations: {stats['total_ops']:,}")
        emit(f"Overall operation rate: {stats['ops_per_second']/1e9:.2f} GOP/s")
        emit(f"Total chunks processed: {stats['total_chunks']}")

        # One section per core listed in chunks_per_core
        for core, chunk_count in stats['chunks_per_core'].items():
            emit(f"\nCore {core} final stats:")
            emit(f"- Chunks processed: {chunk_count}")
            emit(f"- Cache hits: {stats['cache_hits'].get(core, 0)}")
            emit(f"- Processing time: {stats['processing_times'].get(core, 0):.2f}s")

        # Cache totals; the rate is reported as 0 when no chunks were counted
        hits_total = sum(stats['cache_hits'].values())
        chunk_total = stats['total_chunks']
        if chunk_total > 0:
            hit_ratio = hits_total / chunk_total
        else:
            hit_ratio = 0
        emit("\nCache performance:")
        emit(f"- Total cache hits: {hits_total}")
        emit(f"- Cache hit rate: {hit_ratio*100:.2f}%")

    def cleanup(self):
        """Forward the cleanup request to the distributor."""
        self.distributor.cleanup()
            
if __name__ == "__main__":
    # Script entry: push several large random tensors through one processor,
    # one operation per scenario, and always clean the processor up afterwards.
    print("\n=== Testing Hierarchical Cache-Aware Tensor Processing ===\n")

    # One 8-core processor is shared by every scenario
    processor = ParallelTensorProcessor(num_cores=8)

    async def run_tests():
        # Each scenario: (banner, tensor factory, shape-line template,
        # start message, operation name). Factories run lazily, in order,
        # right before the scenario's tensor is needed.
        scenarios = (
            (
                "\nTest 1: Large Matrix Multiplication",
                lambda: np.random.randn(10000, 10000),
                "Input matrix shape: {}",
                "Starting matrix multiplication with hierarchical distribution...",
                "matmul",
            ),
            (
                "\nTest 2: ReLU on Large Tensor",
                # Shaped like a large image tensor, values spread by a factor of 10
                lambda: np.random.randn(5000, 5000, 3) * 10,
                "Input tensor shape: {}",
                "Starting ReLU operation with hierarchical distribution...",
                "relu",
            ),
            (
                "\nTest 3: Sigmoid on Batch",
                # 256 samples with 1024 features each
                lambda: np.random.randn(256, 1024),
                "Input batch shape: {}",
                "Starting sigmoid operation with hierarchical distribution...",
                "sigmoid",
            ),
            (
                "\nStress Test: Continuous Processing",
                lambda: np.random.randn(8000, 8000),
                "Stress tensor shape: {}",
                "Starting continuous processing with hierarchical distribution...",
                "matmul",
            ),
        )

        try:
            for banner, make_tensor, shape_line, start_line, op_name in scenarios:
                print(banner)
                data = make_tensor()
                print(shape_line.format(data.shape))
                print(start_line)
                await processor.process_tensor(data, operation=op_name)

            print("\nAll tests completed successfully!")
        finally:
            # Runs whether or not a scenario failed
            processor.cleanup()

    # Drive the coroutine to completion
    asyncio.run(run_tests())