diff --git a/bl1/bl1.ld.S b/bl1/bl1.ld.S
index b9554d15c..b69065ee4 100644
--- a/bl1/bl1.ld.S
+++ b/bl1/bl1.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -133,7 +133,8 @@ SECTIONS
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/bl1/bl1_fwu.c b/bl1/bl1_fwu.c
index 1cc7daf62..f7fae6823 100644
--- a/bl1/bl1_fwu.c
+++ b/bl1/bl1_fwu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -335,7 +335,7 @@ static int bl1_fwu_image_auth(unsigned int image_id,
 	 */
 	if (image_desc->state == IMAGE_STATE_COPIED) {
 		/* Clear the memory.*/
-		memset((void *)base_addr, 0, total_size);
+		zero_normalmem((void *)base_addr, total_size);
 		flush_dcache_range(base_addr, total_size);
 
 		/* Indicate that image can be copied again*/
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index 25363ace8..31f778790 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -94,12 +94,12 @@ func bl2_entrypoint
 	 */
 	ldr	x0, =__BSS_START__
 	ldr	x1, =__BSS_SIZE__
-	bl	zeromem16
+	bl	zeromem
 
 #if USE_COHERENT_MEM
 	ldr	x0, =__COHERENT_RAM_START__
 	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
-	bl	zeromem16
+	bl	zeromem
 #endif
 
 	/* --------------------------------------------
diff --git a/bl2/bl2.ld.S b/bl2/bl2.ld.S
index fa694de28..b9275f346 100644
--- a/bl2/bl2.ld.S
+++ b/bl2/bl2.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -113,7 +113,8 @@ SECTIONS
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/bl2u/aarch64/bl2u_entrypoint.S b/bl2u/aarch64/bl2u_entrypoint.S
index 1175c6ff1..9fa84bf42 100644
--- a/bl2u/aarch64/bl2u_entrypoint.S
+++ b/bl2u/aarch64/bl2u_entrypoint.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -94,7 +94,7 @@ func bl2u_entrypoint
 	 */
 	ldr	x0, =__BSS_START__
 	ldr	x1, =__BSS_SIZE__
-	bl	zeromem16
+	bl	zeromem
 
 	/* --------------------------------------------
 	 * Allocate a stack whose memory will be marked
diff --git a/bl2u/bl2u.ld.S b/bl2u/bl2u.ld.S
index d72589fca..91e8556ed 100644
--- a/bl2u/bl2u.ld.S
+++ b/bl2u/bl2u.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -100,7 +100,8 @@ SECTIONS
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/bl31/bl31.ld.S b/bl31/bl31.ld.S
index 9a05e6c3c..e5d6232e5 100644
--- a/bl31/bl31.ld.S
+++ b/bl31/bl31.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -158,7 +158,8 @@ SECTIONS
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss (NOLOAD) : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/bl32/sp_min/sp_min.ld.S b/bl32/sp_min/sp_min.ld.S
index e0e23e8f9..f1d4d0b3f 100644
--- a/bl32/sp_min/sp_min.ld.S
+++ b/bl32/sp_min/sp_min.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -134,9 +134,10 @@ SECTIONS
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 8-byte aligned for better performance of the
+     * zero-initialization code.
      */
-    .bss (NOLOAD) : ALIGN(16) {
+    .bss (NOLOAD) : ALIGN(8) {
         __BSS_START__ = .;
         *(.bss*)
         *(COMMON)
diff --git a/bl32/tsp/aarch64/tsp_entrypoint.S b/bl32/tsp/aarch64/tsp_entrypoint.S
index 4c296d4a6..bdb882ab8 100644
--- a/bl32/tsp/aarch64/tsp_entrypoint.S
+++ b/bl32/tsp/aarch64/tsp_entrypoint.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -120,12 +120,12 @@ func tsp_entrypoint
 	 */
 	ldr	x0, =__BSS_START__
 	ldr	x1, =__BSS_SIZE__
-	bl	zeromem16
+	bl	zeromem
 
 #if USE_COHERENT_MEM
 	ldr	x0, =__COHERENT_RAM_START__
 	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
-	bl	zeromem16
+	bl	zeromem
 #endif
 
 	/* --------------------------------------------
diff --git a/bl32/tsp/tsp.ld.S b/bl32/tsp/tsp.ld.S
index 7e24f66d7..d93e3bb08 100644
--- a/bl32/tsp/tsp.ld.S
+++ b/bl32/tsp/tsp.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -104,7 +104,8 @@ SECTIONS
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/common/bl_common.c b/common/bl_common.c
index 47bdad5a8..1d6653075 100644
--- a/common/bl_common.c
+++ b/common/bl_common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -347,7 +347,7 @@ static int load_auth_image_internal(unsigned int image_id,
 			image_data->image_size);
 	if (rc != 0) {
 		/* Authentication error, zero memory and flush it right away. */
-		memset((void *)image_data->image_base, 0x00,
+		zero_normalmem((void *)image_data->image_base,
 			image_data->image_size);
 		flush_dcache_range(image_data->image_base,
 			image_data->image_size);
@@ -543,7 +543,7 @@ static int load_auth_image_internal(meminfo_t *mem_layout,
 			image_data->image_size);
 	if (rc != 0) {
 		/* Authentication error, zero memory and flush it right away. */
-		memset((void *)image_data->image_base, 0x00,
+		zero_normalmem((void *)image_data->image_base,
 			image_data->image_size);
 		flush_dcache_range(image_data->image_base,
 			image_data->image_size);
diff --git a/docs/firmware-design.md b/docs/firmware-design.md
index bd6e2f697..6a20659d1 100644
--- a/docs/firmware-design.md
+++ b/docs/firmware-design.md
@@ -1342,7 +1342,7 @@ All BL images share the following requirements:
 
 The following linker symbols are defined for this purpose:
 
-*   `__BSS_START__`          Must be aligned on a 16-byte boundary.
+*   `__BSS_START__`
 *   `__BSS_SIZE__`
 *   `__COHERENT_RAM_START__` Must be aligned on a page-size boundary.
 *   `__COHERENT_RAM_END__`   Must be aligned on a page-size boundary.
diff --git a/include/common/aarch64/el3_common_macros.S b/include/common/aarch64/el3_common_macros.S
index cbfa6eec7..2e70fc2e8 100644
--- a/include/common/aarch64/el3_common_macros.S
+++ b/include/common/aarch64/el3_common_macros.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -252,12 +252,12 @@
 
 		ldr	x0, =__BSS_START__
 		ldr	x1, =__BSS_SIZE__
-		bl	zeromem16
+		bl	zeromem
 
 #if USE_COHERENT_MEM
 		ldr	x0, =__COHERENT_RAM_START__
 		ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
-		bl	zeromem16
+		bl	zeromem
 #endif
 
 #ifdef IMAGE_BL1
diff --git a/include/lib/utils.h b/include/lib/utils.h
index b6bc9af67..69bbb430a 100644
--- a/include/lib/utils.h
+++ b/include/lib/utils.h
@@ -80,4 +80,35 @@
 # define ULL(_x)	(_x##ull)
 #endif
 
+/*
+ * C code should be put in this part of the header to avoid breaking ASM files
+ * or linker scripts including it.
+ */
+#if !(defined(__LINKER__) || defined(__ASSEMBLY__))
+
+#include <types.h>
+
+/*
+ * Fill a region of normal memory of size "length" in bytes with zero bytes.
+ *
+ * WARNING: This function can only operate on normal memory. This means that
+ *          the MMU must be enabled when using this function. Otherwise, use
+ *          zeromem.
+ */
+void zero_normalmem(void *mem, u_register_t length);
+
+/*
+ * Fill a region of memory of size "length" in bytes with null bytes.
+ *
+ * Unlike zero_normalmem, this function has no restriction on the type of
+ * memory targeted and can be used for any device memory as well as normal
+ * memory. This function must be used instead of zero_normalmem when MMU is
+ * disabled.
+ *
+ * NOTE: When data cache and MMU are enabled, prefer zero_normalmem for faster
+ *       zeroing.
+ */
+void zeromem(void *mem, u_register_t length);
+#endif /* !(defined(__LINKER__) || defined(__ASSEMBLY__)) */
+
 #endif /* __UTILS_H__ */
diff --git a/lib/aarch32/misc_helpers.S b/lib/aarch32/misc_helpers.S
index bf4084a82..dc8479951 100644
--- a/lib/aarch32/misc_helpers.S
+++ b/lib/aarch32/misc_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -34,6 +34,7 @@
 
 	.globl	smc
 	.globl	zeromem
+	.globl	zero_normalmem
 	.globl	memcpy4
 	.globl	disable_mmu_icache_secure
 	.globl	disable_mmu_secure
@@ -50,30 +51,108 @@ func smc
 endfunc smc
 
 /* -----------------------------------------------------------------------
- * void zeromem(void *mem, unsigned int length);
+ * void zeromem(void *mem, unsigned int length)
+ *
+ * Initialise a region in normal memory to 0. This functions complies with the
+ * AAPCS and can be called from C code.
  *
- * Initialise a memory region to 0.
- * The memory address and length must be 4-byte aligned.
  * -----------------------------------------------------------------------
  */
 func zeromem
-#if ASM_ASSERTION
-	tst	r0, #0x3
-	ASM_ASSERT(eq)
-	tst	r1, #0x3
-	ASM_ASSERT(eq)
-#endif
-	add	r2, r0, r1
-	mov	r1, #0
-z_loop:
-	cmp	r2, r0
-	beq	z_end
-	str	r1, [r0], #4
-	b	z_loop
-z_end:
+	/*
+	 * Readable names for registers
+	 *
+	 * Registers r0, r1 and r2 are also set by zeromem which
+	 * branches into the fallback path directly, so cursor, length and
+	 * stop_address should not be retargeted to other registers.
+	 */
+	cursor       .req r0 /* Start address and then current address */
+	length       .req r1 /* Length in bytes of the region to zero out */
+	/*
+	 * Reusing the r1 register as length is only used at the beginning of
+	 * the function.
+	 */
+	stop_address .req r1 /* Address past the last zeroed byte */
+	zeroreg1     .req r2 /* Source register filled with 0 */
+	zeroreg2     .req r3 /* Source register filled with 0 */
+	tmp          .req r12 /* Temporary scratch register */
+
+	mov	zeroreg1, #0
+
+	/* stop_address is the address past the last to zero */
+	add	stop_address, cursor, length
+
+	/*
+	 * Length cannot be used anymore as it shares the same register with
+	 * stop_address.
+	 */
+	.unreq	length
+
+	/*
+	 * If the start address is already aligned to 8 bytes, skip this loop.
+	 */
+	tst	cursor, #(8-1)
+	beq	.Lzeromem_8bytes_aligned
+
+	/* Calculate the next address aligned to 8 bytes */
+	orr	tmp, cursor, #(8-1)
+	adds	tmp, tmp, #1
+	/* If it overflows, fallback to byte per byte zeroing */
+	beq	.Lzeromem_1byte_aligned
+	/* If the next aligned address is after the stop address, fall back */
+	cmp	tmp, stop_address
+	bhs	.Lzeromem_1byte_aligned
+
+	/* zero byte per byte */
+1:
+	strb	zeroreg1, [cursor], #1
+	cmp	cursor, tmp
+	bne	1b
+
+	/* zero 8 bytes at a time */
+.Lzeromem_8bytes_aligned:
+
+	/* Calculate the last 8 bytes aligned address. */
+	bic	tmp, stop_address, #(8-1)
+
+	cmp	cursor, tmp
+	bhs	2f
+
+	mov	zeroreg2, #0
+1:
+	stmia	cursor!, {zeroreg1, zeroreg2}
+	cmp	cursor, tmp
+	blo	1b
+2:
+
+	/* zero byte per byte */
+.Lzeromem_1byte_aligned:
+	cmp	cursor, stop_address
+	beq	2f
+1:
+	strb	zeroreg1, [cursor], #1
+	cmp	cursor, stop_address
+	bne	1b
+2:
 	bx	lr
+
+	.unreq	cursor
+	/*
+	 * length is already unreq'ed to reuse the register for another
+	 * variable.
+	 */
+	.unreq	stop_address
+	.unreq	zeroreg1
+	.unreq	zeroreg2
+	.unreq	tmp
 endfunc zeromem
 
+/*
+ * AArch32 does not have special ways of zeroing normal memory as AArch64 does
+ * using the DC ZVA instruction, so we just alias zero_normalmem to zeromem.
+ */
+.equ	zero_normalmem, zeromem
+
 /* --------------------------------------------------------------------------
  * void memcpy4(void *dest, const void *src, unsigned int length)
  *
diff --git a/lib/aarch64/misc_helpers.S b/lib/aarch64/misc_helpers.S
index 574146f6d..84265e0b2 100644
--- a/lib/aarch64/misc_helpers.S
+++ b/lib/aarch64/misc_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,6 +37,8 @@
 	.globl	eret
 	.globl	smc
 
+	.globl	zero_normalmem
+	.globl	zeromem
 	.globl	zeromem16
 	.globl	memcpy16
 
@@ -80,31 +82,358 @@ endfunc smc
  *
  * Initialise a memory region to 0.
  * The memory address must be 16-byte aligned.
+ * NOTE: This function is deprecated and zeromem should be used instead.
  * -----------------------------------------------------------------------
  */
-func zeromem16
+.equ	zeromem16, zeromem
+
+/* -----------------------------------------------------------------------
+ * void zero_normalmem(void *mem, unsigned int length);
+ *
+ * Initialise a region in normal memory to 0. This functions complies with the
+ * AAPCS and can be called from C code.
+ *
+ * NOTE: MMU must be enabled when using this function as it can only operate on
+ *       normal memory. It is intended to be mainly used from C code when MMU
+ *       is usually enabled.
+ * -----------------------------------------------------------------------
+ */
+.equ	zero_normalmem, zeromem_dczva
+
+/* -----------------------------------------------------------------------
+ * void zeromem(void *mem, unsigned int length);
+ *
+ * Initialise a region of device memory to 0. This functions complies with the
+ * AAPCS and can be called from C code.
+ *
+ * NOTE: When data caches and MMU are enabled, zero_normalmem can usually be
+ *       used instead for faster zeroing.
+ *
+ * -----------------------------------------------------------------------
+ */
+func zeromem
+	/* x2 is the address past the last zeroed address */
+	add	x2, x0, x1
+	/*
+	 * Uses the fallback path that does not use DC ZVA instruction and
+	 * therefore does not need enabled MMU
+	 */
+	b	.Lzeromem_dczva_fallback_entry
+endfunc zeromem
+
+/* -----------------------------------------------------------------------
+ * void zeromem_dczva(void *mem, unsigned int length);
+ *
+ * Fill a region of normal memory of size "length" in bytes with null bytes.
+ * MMU must be enabled and the memory be of
+ * normal type. This is because this function internally uses the DC ZVA
+ * instruction, which generates an Alignment fault if used on any type of
+ * Device memory (see section D3.4.9 of the ARMv8 ARM, issue k). When the MMU
+ * is disabled, all memory behaves like Device-nGnRnE memory (see section
+ * D4.2.8), hence the requirement on the MMU being enabled.
+ * NOTE: The code assumes that the block size as defined in DCZID_EL0
+ *       register is at least 16 bytes.
+ *
+ * -----------------------------------------------------------------------
+ */
+func zeromem_dczva
+
+	/*
+	 * The function consists of a series of loops that zero memory one byte
+	 * at a time, 16 bytes at a time or using the DC ZVA instruction to
+	 * zero aligned block of bytes, which is assumed to be more than 16.
+	 * In the case where the DC ZVA instruction cannot be used or if the
+	 * first 16 bytes loop would overflow, there is fallback path that does
+	 * not use DC ZVA.
+	 * Note: The fallback path is also used by the zeromem function that
+	 *       branches to it directly.
+	 *
+	 *              +---------+   zeromem_dczva
+	 *              |  entry  |
+	 *              +----+----+
+	 *                   |
+	 *                   v
+	 *              +---------+
+	 *              | checks  |>o-------+ (If any check fails, fallback)
+	 *              +----+----+         |
+	 *                   |              |---------------+
+	 *                   v              | Fallback path |
+	 *            +------+------+       |---------------+
+	 *            | 1 byte loop |       |
+	 *            +------+------+       .Lzeromem_dczva_initial_1byte_aligned_end
+	 *                   |              |
+	 *                   v              |
+	 *           +-------+-------+      |
+	 *           | 16 bytes loop |      |
+	 *           +-------+-------+      |
+	 *                   |              |
+	 *                   v              |
+	 *            +------+------+       .Lzeromem_dczva_blocksize_aligned
+	 *            | DC ZVA loop |       |
+	 *            +------+------+       |
+	 *       +--------+  |              |
+	 *       |        |  |              |
+	 *       |        v  v              |
+	 *       |   +-------+-------+      .Lzeromem_dczva_final_16bytes_aligned
+	 *       |   | 16 bytes loop |      |
+	 *       |   +-------+-------+      |
+	 *       |           |              |
+	 *       |           v              |
+	 *       |    +------+------+       .Lzeromem_dczva_final_1byte_aligned
+	 *       |    | 1 byte loop |       |
+	 *       |    +-------------+       |
+	 *       |           |              |
+	 *       |           v              |
+	 *       |       +---+--+           |
+	 *       |       | exit |           |
+	 *       |       +------+           |
+	 *       |                          |
+	 *       |           +--------------+    +------------------+ zeromem
+	 *       |           |  +----------------| zeromem function |
+	 *       |           |  |                +------------------+
+	 *       |           v  v
+	 *       |    +-------------+ .Lzeromem_dczva_fallback_entry
+	 *       |    | 1 byte loop |
+	 *       |    +------+------+
+	 *       |           |
+	 *       +-----------+
+	 */
+
+	/*
+	 * Readable names for registers
+	 *
+	 * Registers x0, x1 and x2 are also set by zeromem which
+	 * branches into the fallback path directly, so cursor, length and
+	 * stop_address should not be retargeted to other registers.
+	 */
+	cursor       .req x0 /* Start address and then current address */
+	length       .req x1 /* Length in bytes of the region to zero out */
+	/* Reusing x1 as length is never used after block_mask is set */
+	block_mask   .req x1 /* Bitmask of the block size read in DCZID_EL0 */
+	stop_address .req x2 /* Address past the last zeroed byte */
+	block_size   .req x3 /* Size of a block in bytes as read in DCZID_EL0 */
+	tmp1         .req x4
+	tmp2         .req x5
+
 #if ASM_ASSERTION
-	tst	x0, #0xf
-	ASM_ASSERT(eq)
+	/*
+	 * Check for M bit (MMU enabled) of the current SCTLR_EL(1|3)
+	 * register value and panic if the MMU is disabled.
+	 */
+#if defined(IMAGE_BL1) || defined(IMAGE_BL31)
+	mrs	tmp1, sctlr_el3
+#else
+	mrs	tmp1, sctlr_el1
 #endif
-	add	x2, x0, x1
-/* zero 16 bytes at a time */
-z_loop16:
-	sub	x3, x2, x0
-	cmp	x3, #16
-	b.lt	z_loop1
-	stp	xzr, xzr, [x0], #16
-	b	z_loop16
-/* zero byte per byte */
-z_loop1:
-	cmp	x0, x2
-	b.eq	z_end
-	strb	wzr, [x0], #1
-	b	z_loop1
-z_end:
+
+	tst	tmp1, #SCTLR_M_BIT
+	ASM_ASSERT(ne)
+#endif /* ASM_ASSERTION */
+
+	/* stop_address is the address past the last to zero */
+	add	stop_address, cursor, length
+
+	/*
+	 * Get block_size = (log2(<block size>) >> 2) (see encoding of
+	 * dczid_el0 reg)
+	 */
+	mrs	block_size, dczid_el0
+
+	/*
+	 * Select the 4 lowest bits and convert the extracted log2(<block size>) to
+	 * <block size>
+	 */
+	ubfx	block_size, block_size, #0, #4
+	mov	tmp2, #(1 << 2)
+	lsl	block_size, tmp2, block_size
+
+#if ASM_ASSERTION
+	/*
+	 * Assumes block size is at least 16 bytes to avoid manual realignment
+	 * of the cursor at the end of the DCZVA loop.
+	 */
+	cmp	block_size, #16
+	ASM_ASSERT(hs)
+#endif
+	/*
+	 * Not worth doing all the setup for a region less than a block and
+	 * protects against zeroing a whole block when the area to zero is
+	 * smaller than that. Also, as it is assumed that the block size is at
+	 * least 16 bytes, this also protects the initial aligning loops from
+	 * trying to zero 16 bytes when length is less than 16.
+	 */
+	cmp	length, block_size
+	b.lo	.Lzeromem_dczva_fallback_entry
+
+	/*
+	 * Calculate the bitmask of the block alignment. It will never
+	 * underflow as the block size is between 4 bytes and 2kB.
+	 * block_mask = block_size - 1
+	 */
+	sub	block_mask, block_size, #1
+
+	/*
+	 * length alias should not be used after this point unless it is
+	 * defined as a register other than block_mask's.
+	 */
+	.unreq	length
+
+	/*
+	 * If the start address is already aligned to zero block size, go
+	 * straight to the cache zeroing loop. This is safe because at this
+	 * point, the length cannot be smaller than a block size.
+	 */
+	tst	cursor, block_mask
+	b.eq	.Lzeromem_dczva_blocksize_aligned
+
+	/*
+	 * Calculate the first block-size-aligned address. It is assumed that
+	 * the zero block size is at least 16 bytes. This address is the last
+	 * address of this initial loop.
+	 */
+	orr	tmp1, cursor, block_mask
+	add	tmp1, tmp1, #1
+
+	/*
+	 * If the addition overflows, skip the cache zeroing loops. This is
+	 * quite unlikely however.
+	 */
+	cbz	tmp1, .Lzeromem_dczva_fallback_entry
+
+	/*
+	 * If the first block-size-aligned address is past the last address,
+	 * fallback to the simpler code.
+	 */
+	cmp	tmp1, stop_address
+	b.hi	.Lzeromem_dczva_fallback_entry
+
+	/*
+	 * If the start address is already aligned to 16 bytes, skip this loop.
+	 * It is safe to do this because tmp1 (the stop address of the initial
+	 * 16 bytes loop) will never be greater than the final stop address.
+	 */
+	tst	cursor, #0xf
+	b.eq	.Lzeromem_dczva_initial_1byte_aligned_end
+
+	/* Calculate the next address aligned to 16 bytes */
+	orr	tmp2, cursor, #0xf
+	add	tmp2, tmp2, #1
+	/* If it overflows, fallback to the simple path (unlikely) */
+	cbz	tmp2, .Lzeromem_dczva_fallback_entry
+	/*
+	 * Next aligned address cannot be after the stop address because the
+	 * length cannot be smaller than 16 at this point.
+	 */
+
+	/* First loop: zero byte per byte */
+1:
+	strb	wzr, [cursor], #1
+	cmp	cursor, tmp2
+	b.ne	1b
+.Lzeromem_dczva_initial_1byte_aligned_end:
+
+	/*
+	 * Second loop: we need to zero 16 bytes at a time from cursor to tmp1
+	 * before being able to use the code that deals with block-size-aligned
+	 * addresses.
+	 */
+	cmp	cursor, tmp1
+	b.hs	2f
+1:
+	stp	xzr, xzr, [cursor], #16
+	cmp	cursor, tmp1
+	b.lo	1b
+2:
+
+	/*
+	 * Third loop: zero a block at a time using DC ZVA cache block zeroing
+	 * instruction.
+	 */
+.Lzeromem_dczva_blocksize_aligned:
+	/*
+	 * Calculate the last block-size-aligned address. If the result equals
+	 * to the start address, the loop will exit immediately.
+	 */
+	bic	tmp1, stop_address, block_mask
+
+	cmp	cursor, tmp1
+	b.hs	2f
+1:
+	/* Zero the block containing the cursor */
+	dc	zva, cursor
+	/* Increment the cursor by the size of a block */
+	add	cursor, cursor, block_size
+	cmp	cursor, tmp1
+	b.lo	1b
+2:
+
+	/*
+	 * Fourth loop: zero 16 bytes at a time and then byte per byte the
+	 * remaining area
+	 */
+.Lzeromem_dczva_final_16bytes_aligned:
+	/*
+	 * Calculate the last 16 bytes aligned address. It is assumed that the
+	 * block size will never be smaller than 16 bytes so that the current
+	 * cursor is aligned to at least 16 bytes boundary.
+	 */
+	bic	tmp1, stop_address, #15
+
+	cmp	cursor, tmp1
+	b.hs	2f
+1:
+	stp	xzr, xzr, [cursor], #16
+	cmp	cursor, tmp1
+	b.lo	1b
+2:
+
+	/* Fifth and final loop: zero byte per byte */
+.Lzeromem_dczva_final_1byte_aligned:
+	cmp	cursor, stop_address
+	b.eq	2f
+1:
+	strb	wzr, [cursor], #1
+	cmp	cursor, stop_address
+	b.ne	1b
+2:
 	ret
-endfunc zeromem16
+
+	/* Fallback for unaligned start addresses */
+.Lzeromem_dczva_fallback_entry:
+	/*
+	 * If the start address is already aligned to 16 bytes, skip this loop.
+	 */
+	tst	cursor, #0xf
+	b.eq	.Lzeromem_dczva_final_16bytes_aligned
+
+	/* Calculate the next address aligned to 16 bytes */
+	orr	tmp1, cursor, #15
+	add	tmp1, tmp1, #1
+	/* If it overflows, fallback to byte per byte zeroing */
+	cbz	tmp1, .Lzeromem_dczva_final_1byte_aligned
+	/* If the next aligned address is after the stop address, fall back */
+	cmp	tmp1, stop_address
+	b.hs	.Lzeromem_dczva_final_1byte_aligned
+
+	/* Fallback entry loop: zero byte per byte */
+1:
+	strb	wzr, [cursor], #1
+	cmp	cursor, tmp1
+	b.ne	1b
+
+	b	.Lzeromem_dczva_final_16bytes_aligned
+
+	.unreq	cursor
+	/*
+	 * length is already unreq'ed to reuse the register for another
+	 * variable.
+	 */
+	.unreq	stop_address
+	.unreq	block_size
+	.unreq	block_mask
+	.unreq	tmp1
+	.unreq	tmp2
+endfunc zeromem_dczva
 
 /* --------------------------------------------------------------------------
  * void memcpy16(void *dest, const void *src, unsigned int length)
diff --git a/plat/arm/css/common/css_bl2_setup.c b/plat/arm/css/common/css_bl2_setup.c
index 11ca34230..5361d897e 100644
--- a/plat/arm/css/common/css_bl2_setup.c
+++ b/plat/arm/css/common/css_bl2_setup.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -98,7 +98,7 @@ void bl2_platform_setup(void)
 	 * - restoring the SCP boot configuration.
 	 */
 	VERBOSE("BL2: Restoring SCP reset data in Trusted SRAM\n");
-	memset((void *) ARM_TRUSTED_SRAM_BASE, 0, 128);
+	zero_normalmem((void *)ARM_TRUSTED_SRAM_BASE, 128);
 	mmio_write_32(SCP_BOOT_CFG_ADDR, scp_boot_config);
 }
 #endif /* EL3_PAYLOAD_BASE */
diff --git a/plat/mediatek/mt6795/bl31.ld.S b/plat/mediatek/mt6795/bl31.ld.S
index 44510a75c..472cd2e0d 100644
--- a/plat/mediatek/mt6795/bl31.ld.S
+++ b/plat/mediatek/mt6795/bl31.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -113,7 +113,8 @@ SECTIONS
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss (NOLOAD) : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/plat/nvidia/tegra/common/drivers/memctrl/memctrl.c b/plat/nvidia/tegra/common/drivers/memctrl/memctrl.c
index 40d1bab06..689f2d7cd 100644
--- a/plat/nvidia/tegra/common/drivers/memctrl/memctrl.c
+++ b/plat/nvidia/tegra/common/drivers/memctrl/memctrl.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,8 +37,6 @@
 #include
 #include
 
-extern void zeromem16(void *mem, unsigned int length);
-
 #define TEGRA_GPU_RESET_REG_OFFSET	0x28c
 #define GPU_RESET_BIT			(1 << 24)
 
@@ -114,13 +112,13 @@ static void tegra_clear_videomem(uintptr_t non_overlap_area_start,
 	 * Perform cache maintenance to ensure that the non-overlapping area is
 	 * zeroed out. The first invalidation of this range ensures that
 	 * possible evictions of dirty cache lines do not interfere with the
-	 * 'zeromem16' operation. Other CPUs could speculatively prefetch the
+	 * 'zeromem' operation. Other CPUs could speculatively prefetch the
 	 * main memory contents of this area between the first invalidation and
-	 * the 'zeromem16' operation. The second invalidation ensures that any
+	 * the 'zeromem' operation. The second invalidation ensures that any
 	 * such cache lines are removed as well.
 	 */
 	inv_dcache_range(non_overlap_area_start, non_overlap_area_size);
-	zeromem16((void *)non_overlap_area_start, non_overlap_area_size);
+	zeromem((void *)non_overlap_area_start, non_overlap_area_size);
 	inv_dcache_range(non_overlap_area_start, non_overlap_area_size);
 }
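
Usage note (illustrative only, not part of the patch): the comments added to include/lib/utils.h draw a clear line between the two helpers — zero_normalmem() requires the MMU to be on and the target to be normal memory, while zeromem() works in any state but is slower. From a caller's point of view that split looks roughly like the sketch below; scrub_examples(), mapped_buf and early_buf are made-up names, only the two prototypes come from the patch, and the TF include path is assumed.

```c
#include <utils.h>	/* declares zero_normalmem() and zeromem() */

/* Hypothetical caller, for illustration only. */
static void scrub_examples(void *mapped_buf, void *early_buf, u_register_t len)
{
	/*
	 * MMU enabled and the buffer mapped as normal memory: the fast helper
	 * is allowed to use DC ZVA on AArch64.
	 */
	zero_normalmem(mapped_buf, len);

	/*
	 * MMU still off, or the region is device memory: only the generic
	 * helper is safe here (its path never reaches the DC ZVA loop).
	 */
	zeromem(early_buf, len);
}
```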
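For reference, the DCZID_EL0 decoding that zeromem_dczva performs with ubfx/lsl can be written out in C. This is a sketch assuming an AArch64 compiler and an exception level that may read DCZID_EL0; dc_zva_block_size() is a hypothetical name, not a function from the tree.

```c
#include <stdint.h>

/*
 * DCZID_EL0[3:0] holds log2 of the DC ZVA block size in 4-byte words, so the
 * size in bytes is 4 << BS - exactly what the "mov tmp2, #(1 << 2)" followed
 * by "lsl block_size, tmp2, block_size" sequence computes.
 */
static inline uint64_t dc_zva_block_size(void)
{
	uint64_t dczid;

	__asm__ volatile("mrs %0, dczid_el0" : "=r"(dczid));

	/* Between 4 bytes and 2kB, per the comment in zeromem_dczva. */
	return 4ULL << (dczid & 0xfULL);
}
```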
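The five loops shown in the zeromem_dczva flow diagram can also be read as structured C. The sketch below is hypothetical and slightly simplified — the real fallback path re-aligns the cursor to 16 bytes and reuses the 16-byte loop, whereas here a region smaller than a block simply falls through to the final byte loop — but the staging (byte loop, 16-byte stores, DC ZVA blocks, 16-byte stores, byte loop) mirrors the assembly. Like the assembly, it assumes the MMU is on and the region is normal memory.

```c
#include <stddef.h>
#include <stdint.h>

/* Hypothetical C rendering of the loop staging in zeromem_dczva. */
static void zeromem_dczva_sketch(void *mem, size_t length)
{
	uintptr_t cursor = (uintptr_t)mem;
	uintptr_t stop = cursor + length;
	uint64_t dczid, block_size, block_mask;

	__asm__ volatile("mrs %0, dczid_el0" : "=r"(dczid));
	block_size = 4ULL << (dczid & 0xfULL);
	block_mask = block_size - 1;

	/* Not worth the setup (and unsafe for the aligning loops) below one block. */
	if (length < block_size)
		goto tail_bytes;

	/* 1. Byte loop up to the first 16-byte boundary. */
	while (cursor & 0xf)
		*(volatile uint8_t *)cursor++ = 0;

	/* 2. 16-byte stores (stp xzr, xzr in the assembly) up to block alignment. */
	while (cursor & block_mask) {
		*(volatile uint64_t *)cursor = 0;
		*(volatile uint64_t *)(cursor + 8) = 0;
		cursor += 16;
	}

	/* 3. DC ZVA on every whole block left before the stop address. */
	while (stop - cursor >= block_size) {
		__asm__ volatile("dc zva, %0" : : "r"(cursor) : "memory");
		cursor += block_size;
	}

	/* 4. 16-byte stores for the remaining whole 16-byte chunks. */
	while (stop - cursor >= 16) {
		*(volatile uint64_t *)cursor = 0;
		*(volatile uint64_t *)(cursor + 8) = 0;
		cursor += 16;
	}

tail_bytes:
	/* 5. Byte loop for whatever is left. */
	while (cursor < stop)
		*(volatile uint8_t *)cursor++ = 0;
}
```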