#include #include #include "cache.h" #include "logging.h" #include "rpi-base.h" #include "startup.h" #include "defs.h" // Historical Note: // Were seeing core 3 crashes if inner *and* outer both set to some flavour of WB (i.e. 1 or 3) // The point of crashing is when the data cache is enabled // At that point, the stack appears to vanish and the data read back is 0x55555555 // Reason turned out to be failure to correctly invalidate the entire data cache const static unsigned l1_cached_threshold = L2_CACHED_MEM_BASE >> 20; const static unsigned l2_cached_threshold = UNCACHED_MEM_BASE >> 20; volatile __attribute__ ((aligned (0x4000))) unsigned int PageTable[4096]; volatile __attribute__ ((aligned (0x4000))) unsigned int PageTable2[NUM_4K_PAGES]; const static int aa = 1; const static int bb = 1; const static int shareable = 1; #define SETWAY_LEVEL_SHIFT 1 // 4 ways x 128 sets x 64 bytes per line 32KB #define L1_DATA_CACHE_SETS 128 #define L1_DATA_CACHE_WAYS 4 #define L1_SETWAY_WAY_SHIFT 30 // 32-Log2(L1_DATA_CACHE_WAYS) #define L1_SETWAY_SET_SHIFT 6 // Log2(L1_DATA_CACHE_LINE_LENGTH) // 8 ways x 1024 sets x 64 bytes per line = 512KB #define PI2_L2_CACHE_SETS 1024 #define PI2_L2_CACHE_WAYS 8 #define PI2_L2_SETWAY_WAY_SHIFT 29 // 32-Log2(L2_CACHE_WAYS) // 16 ways x 512 sets x 64 bytes per line = 512KB #define PI3_L2_CACHE_SETS 512 #define PI3_L2_CACHE_WAYS 16 #define PI3_L2_SETWAY_WAY_SHIFT 28 // 32-Log2(L2_CACHE_WAYS) #define L2_SETWAY_SET_SHIFT 6 // Log2(L2_CACHE_LINE_LENGTH) // The origin of this function is: // https://github.com/rsta2/uspi/blob/master/env/lib/synchronize.c void InvalidateDataCache (void) { unsigned nSet; unsigned nWay; uint32_t nSetWayLevel; // invalidate L1 data cache for (nSet = 0; nSet < L1_DATA_CACHE_SETS; nSet++) { for (nWay = 0; nWay < L1_DATA_CACHE_WAYS; nWay++) { nSetWayLevel = nWay << L1_SETWAY_WAY_SHIFT | nSet << L1_SETWAY_SET_SHIFT | 0 << SETWAY_LEVEL_SHIFT; asm volatile ("mcr p15, 0, %0, c7, c6, 2" : : "r" (nSetWayLevel) : "memory"); // DCISW } } if (_get_hardware_id() == _RPI2) { //Raspberry PI 2 // invalidate L2 unified cache for (nSet = 0; nSet < PI2_L2_CACHE_SETS; nSet++) { for (nWay = 0; nWay < PI2_L2_CACHE_WAYS; nWay++) { nSetWayLevel = nWay << PI2_L2_SETWAY_WAY_SHIFT | nSet << L2_SETWAY_SET_SHIFT | 1 << SETWAY_LEVEL_SHIFT; asm volatile ("mcr p15, 0, %0, c7, c6, 2" : : "r" (nSetWayLevel) : "memory"); // DCISW } } } else { // invalidate L2 unified cache for (nSet = 0; nSet < PI3_L2_CACHE_SETS; nSet++) { for (nWay = 0; nWay < PI3_L2_CACHE_WAYS; nWay++) { nSetWayLevel = nWay << PI3_L2_SETWAY_WAY_SHIFT | nSet << L2_SETWAY_SET_SHIFT | 1 << SETWAY_LEVEL_SHIFT; asm volatile ("mcr p15, 0, %0, c7, c6, 2" : : "r" (nSetWayLevel) : "memory"); // DCISW } } } } void CleanDataCache (void) { unsigned nSet; unsigned nWay; uint32_t nSetWayLevel; // clean L1 data cache for (nSet = 0; nSet < L1_DATA_CACHE_SETS; nSet++) { for (nWay = 0; nWay < L1_DATA_CACHE_WAYS; nWay++) { nSetWayLevel = nWay << L1_SETWAY_WAY_SHIFT | nSet << L1_SETWAY_SET_SHIFT | 0 << SETWAY_LEVEL_SHIFT; asm volatile ("mcr p15, 0, %0, c7, c10, 2" : : "r" (nSetWayLevel) : "memory"); } } if (_get_hardware_id() == _RPI2) { //Raspberry PI 2 // clean L2 unified cache for (nSet = 0; nSet < PI2_L2_CACHE_SETS; nSet++) { for (nWay = 0; nWay < PI2_L2_CACHE_WAYS; nWay++) { nSetWayLevel = nWay << PI2_L2_SETWAY_WAY_SHIFT | nSet << L2_SETWAY_SET_SHIFT | 1 << SETWAY_LEVEL_SHIFT; asm volatile ("mcr p15, 0, %0, c7, c10, 2" : : "r" (nSetWayLevel) : "memory"); } } } else { // clean L2 unified cache for (nSet = 0; nSet < PI3_L2_CACHE_SETS; nSet++) { for (nWay = 0; nWay < PI3_L2_CACHE_WAYS; nWay++) { nSetWayLevel = nWay << PI3_L2_SETWAY_WAY_SHIFT | nSet << L2_SETWAY_SET_SHIFT | 1 << SETWAY_LEVEL_SHIFT; asm volatile ("mcr p15, 0, %0, c7, c10, 2" : : "r" (nSetWayLevel) : "memory"); } } } } // TLB 4KB Section Descriptor format // 31..12 Section Base Address // 11..9 - unused, set to zero // 8..6 TEX - type extension- TEX, C, B used together, see below // 5..4 AP - access ctrl - set to 11 for full access from user and super modes // 3 C - cacheable - TEX, C, B used together, see below // 2 B - bufferable - TEX, C, B used together, see below // 1 1 // 0 1 void map_4k_page(int logical, int physical) { // Invalidate the data TLB before changing mapping _invalidate_dtlb_mva((void *)(logical << 12)); // Setup the 4K page table entry // Second level descriptors use extended small page format so inner/outer cacheing can be controlled // Pi 0/1: // XP (bit 23) in SCTRL is 0 so descriptors use ARMv4/5 backwards compatible format // Pi 2/3: // XP (bit 23) in SCTRL no longer exists, and we see to be using ARMv6 table formats // this means bit 0 of the page table is actually XN and must be clear to allow native ARM code to execute // (this was the cause of issue #27) if (_get_hardware_id() >= _RPI2) { PageTable2[logical] = (physical<<12) | 0x132 | (bb << 6) | (aa << 2); } else { PageTable2[logical] = (physical<<12) | 0x133 | (bb << 6) | (aa << 2); } } void enable_MMU_and_IDCaches(int cached_screen_area, int cached_screen_size) { log_debug("enable_MMU_and_IDCaches"); //log_debug("cpsr = %08x", _get_cpsr()); unsigned i; unsigned base; // TLB 1MB Sector Descriptor format // 31..20 Section Base Address // 19 NS - ? - set to 0 // 18 0 - - set to 0 // 17 nG - ? - set to 0 // 16 S - ? - set to 0 // 15 APX - access ctrl - set to 0 for full access from user and super modes // 14..12 TEX - type extension- TEX, C, B used together, see below // 11..10 AP - access ctrl - set to 11 for full access from user and super modes // 9 P - - set to 0 // 8..5 Domain- access domain - set to 0000 as nor using access ctrl // 4 XN - eXecute Never - set to 1 for I/O devices // 3 C - cacheable - set to 1 for cachable RAM i // 2 B - bufferable - set to 1 for cachable RAM // 1 1 - TEX, C, B used together, see below // 0 0 - TEX, C, B used together, see below // For I/O devices // TEX = 000; C=0; B=1 (Shared device) // For cacheable RAM // TEX = 001; C=1; B=1 (Outer and inner write back, write allocate) // For non-cachable RAM // TEX = 001; C=0; B=0 (Outer and inner non-cacheable) // For individual control // TEX = 1BB CB=AA // AA = inner policy // BB = outer policy // 00 = NC (non-cacheable) // 01 = WBWA (write-back, write allocate) // 10 = WT (write-through // 11 = WBNWA (write-back, no write allocate) /// TEX = 100; C=0; B=1 (outer non cacheable, inner write-back, write allocate) for (base = 0; base < l1_cached_threshold; base++) // 0x04000000 64MB { // Value from my original RPI code = 11C0E (outer and inner write back, write allocate, shareable) // bits 11..10 are the AP bits, and setting them to 11 enables user mode access as well // Values from RPI2 = 11C0E (outer and inner write back, write allocate, shareable (fast but unsafe)); works on RPI // Values from RPI2 = 10C0A (outer and inner write through, no write allocate, shareable) // Values from RPI2 = 15C0A (outer write back, write allocate, inner write through, no write allocate, shareable) PageTable[base] = base << 20 | 0x04C02 | (shareable << 16) | (bb << 12) | (aa << 2); } for (; base < l2_cached_threshold; base++) // 0x08000000 128MB { PageTable[base] = base << 20 | 0x04C02 | (shareable << 16) | (bb << 12); } for (; base < (_get_peripheral_base() >> 20); base++) { PageTable[base] = base << 20 | 0x01C02; } for (; base < 4096; base++) { // shared device, never execute PageTable[base] = base << 20 | 0x10C16; } #if defined(USE_CACHED_SCREEN) if (cached_screen_area != 0) { for (base = (cached_screen_area >> 20); base < ((cached_screen_area + cached_screen_size) >> 20); base++) { PageTable[base] = base << 20 | 0x04C02 | (shareable << 16) | (bb << 12) | (aa << 2); //cached part of screen ram } } #endif // suppress a warning as we really do want to copy from src address 0! #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wnonnull" // copy vectors from virtual address zero to a higher unused location // cppcheck-suppress nullPointer memcpy((void *)HIGH_VECTORS_BASE, (void *)0, 0x1000); #pragma GCC diagnostic pop // replace the first N 1MB entries with second level page tables, giving N x 256 4K pages for (i = 0; i < NUM_4K_PAGES >> 8; i++) { PageTable[i] = (unsigned int) (&PageTable2[i << 8]); PageTable[i] +=1; } // populate the second level page tables for (base = 0; base < NUM_4K_PAGES; base++) { map_4k_page(base, base); } // relocate the vector pointer to the moved page asm volatile("mcr p15, 0, %[addr], c12, c0, 0" : : [addr] "r" (HIGH_VECTORS_BASE)); if (_get_hardware_id() >= _RPI3) { unsigned cpuextctrl0, cpuextctrl1; asm volatile ("mrrc p15, 1, %0, %1, c15" : "=r" (cpuextctrl0), "=r" (cpuextctrl1)); //log_debug("extctrl = %08x %08x", cpuextctrl1, cpuextctrl0); } else { // RPI: bit 6 of auxctrl is restrict cache size to 16K (no page coloring) // RPI2: bit 6 of auxctrl is set SMP bit, otherwise all caching disabled unsigned auxctrl; asm volatile ("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctrl)); auxctrl |= 1 << 6; asm volatile ("mcr p15, 0, %0, c1, c0, 1" :: "r" (auxctrl)); asm volatile ("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctrl)); //log_debug("auxctrl = %08x", auxctrl); } // set domain 0 to client asm volatile ("mcr p15, 0, %0, c3, c0, 0" :: "r" (1)); // always use TTBR0 asm volatile ("mcr p15, 0, %0, c2, c0, 2" :: "r" (0)); unsigned ttbcr; asm volatile ("mrc p15, 0, %0, c2, c0, 2" : "=r" (ttbcr)); //log_debug("ttbcr = %08x", ttbcr); if (_get_hardware_id() >= _RPI2) { // set TTBR0 - page table walk memory cacheability/shareable // [Bit 0, Bit 6] indicates inner cachability: 01 = normal memory, inner write-back write-allocate cacheable // [Bit 4, Bit 3] indicates outer cachability: 01 = normal memory, outer write-back write-allocate cacheable // Bit 1 indicates sharable // 4A = 0100 1010 int attr = ((aa & 1) << 6) | (bb << 3) | (shareable << 1) | ((aa & 2) >> 1); asm volatile ("mcr p15, 0, %0, c2, c0, 0" :: "r" (attr | (unsigned) &PageTable)); } else { // set TTBR0 (page table walk inner cacheable, outer non-cacheable, shareable memory) asm volatile ("mcr p15, 0, %0, c2, c0, 0" :: "r" (0x03 | (unsigned) &PageTable)); } unsigned ttbr0; asm volatile ("mrc p15, 0, %0, c2, c0, 0" : "=r" (ttbr0)); //log_debug("ttbr0 = %08x", ttbr0); // Invalidate entire data cache if (_get_hardware_id() >= _RPI2) { asm volatile (".word 0xf57ff06f" ::: "memory"); // asm volatile ("isb" ::: "memory"); (won't compile on arm v6) InvalidateDataCache(); } else { // invalidate data cache and flush prefetch buffer // NOTE: The below code seems to cause a Pi 2 to crash asm volatile ("mcr p15, 0, %0, c7, c5, 4" :: "r" (0) : "memory"); asm volatile ("mcr p15, 0, %0, c7, c6, 0" :: "r" (0) : "memory"); } // enable MMU, L1 cache and instruction cache, L2 cache, write buffer, // branch prediction and extended page table on unsigned sctrl; asm volatile ("mrc p15,0,%0,c1,c0,0" : "=r" (sctrl)); // Bit 13 enable vector relocation // Bit 12 enables the L1 instruction cache // Bit 11 enables branch pre-fetching // Bit 2 enables the L1 data cache // Bit 0 enabled the MMU // The L1 instruction cache can be used independently of the MMU // The L1 data cache will one be enabled if the MMU is enabled sctrl |= 0x00001805; asm volatile ("mcr p15,0,%0,c1,c0,0" :: "r" (sctrl) : "memory"); asm volatile ("mrc p15,0,%0,c1,c0,0" : "=r" (sctrl)); //log_debug("sctrl = %08x", sctrl); // For information, show the cache type register // From this you can tell what type of cache is implemented unsigned ctype; asm volatile ("mrc p15,0,%0,c0,c0,1" : "=r" (ctype)); //log_debug("ctype = %08x", ctype); }