
std::vector<uint8_t> manually copying instead of calling memcpy when C++11/14 enabled


Using gcc 4.9 to cross-compile for ARM with a Linaro toolchain, I found that adding -std=c++14 changes how vector.assign() compiles, producing a serious performance problem.

I have tried several different ways of doing the allocation + copy, but they all have this performance problem as long as I use std::vector to do it.

I can reproduce the problem with this toy example:

VectorTest.h

#include <stdint.h>
#include <stddef.h>
#include <vector>

struct VectorWrapper_t
{
    VectorWrapper_t(uint8_t const* pData, size_t length);
    std::vector<uint8_t> data;
};

VectorTest.cpp

#include "VectorTest.h"
VectorWrapper_t::VectorWrapper_t(uint8_t const* pData, size_t length)
{
data.assign(pData, pData + length);
}

gcc flags:

-std=c++14 
-mthumb -march=armv7-a -mtune=cortex-a9 
-mlittle-endian -mfloat-abi=hard -mfpu=neon -Wa,-mimplicit-it=thumb 
-O2 -g

Looking at the assembly, I can see why: the original version (which I assume is C++03?) calls memmove, while the C++14 version adds an extra loop that looks like it copies the data by hand. Judging by the .loc directives that gcc emits with -fverbose-asm, the instructions in this loop come from stl_construct.h and stl_uninitialized.h.
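For reference, the generic fallback in stl_uninitialized.h amounts to an element-by-element placement-new loop rather than one bulk move. The following is my paraphrased sketch, not the exact libstdc++ internals; the name uninitialized_copy_generic is illustrative:

#include <iterator>
#include <new>

// Sketch of the slow path: construct each element in place, one at a
// time, instead of issuing a single memmove over the whole range.
template <typename InputIt, typename ForwardIt>
ForwardIt uninitialized_copy_generic(InputIt first, InputIt last,
                                     ForwardIt result)
{
    typedef typename std::iterator_traits<ForwardIt>::value_type ValueType;
    for (; first != last; ++first, ++result)
        ::new (static_cast<void*>(&*result)) ValueType(*first);
    return result;
}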

Switching to gcc 5.2.1 (still with C++14), the code compiles almost identically to the C++03 example, except that it uses memcpy instead of memmove (which is valid here, since the freshly allocated buffer cannot overlap the source).

I can work around the problem by using std::unique_ptr<uint8_t[]> instead of vector, as sketched below. However, I would like to get to the bottom of this, to understand whether other uses of vector have the same performance problem and how they might be addressed (updating to gcc 5.2 is not practical).
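A minimal sketch of that workaround, keeping the same constructor signature; the name BufferWrapper_t is illustrative:

#include <stdint.h>
#include <stddef.h>
#include <cstring>
#include <memory>

// Replace the vector with a raw heap buffer so the copy is an explicit
// memcpy call rather than whatever std::vector's assign compiles to.
struct BufferWrapper_t
{
    BufferWrapper_t(uint8_t const* pData, size_t length)
        : data(new uint8_t[length]), size(length)
    {
        if (length != 0)                          // avoid memcpy on a null source
            std::memcpy(data.get(), pData, length);
    }
    std::unique_ptr<uint8_t[]> data;
    size_t size;
};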

So my question is: why does this compile differently under C++11/14?

For reference, gcc --version reports:
arm-linux-gnueabihf-gcc (Linaro GCC 4.9-2014.12) 4.9.3 20141205 (prerelease)

Here is the assembly generated by gcc:

# C++03, gcc 4.9
push    {r3, r4, r5, r6, r7, lr}    @
movs    r3, #0  @ tmp118,
mov r4, r0  @ this, this
str r3, [r0]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_start
mov r5, r2  @ length, length
str r3, [r0, #4]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_finish
str r3, [r0, #8]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_end_of_storage
cbnz    r2, .L19    @ length,
mov r0, r4  @, this
pop {r3, r4, r5, r6, r7, pc}    @
.L19:
mov r0, r2  @, length
mov r6, r1  @ pData, pData
bl  _Znwj   @
mov r2, r5  @, length
mov r1, r6  @, pData
mov r7, r0  @ D.13516,
bl  memmove @
ldr r0, [r4]    @ D.13515, MEM[(struct vector *)this_1(D)].D.11902._M_impl._M_start
cbz r0, .L3 @ D.13515,
bl  _ZdlPv  @
.L3:
add r5, r5, r7  @ D.13515, D.13516
str r7, [r4]    @ D.13516, MEM[(struct vector *)this_1(D)].D.11902._M_impl._M_start
str r5, [r4, #4]    @ D.13515, MEM[(struct vector *)this_1(D)].D.11902._M_impl._M_finish
mov r0, r4  @, this
str r5, [r4, #8]    @ D.13515, MEM[(struct vector *)this_1(D)].D.11902._M_impl._M_end_of_storage
pop {r3, r4, r5, r6, r7, pc}    @
.L6:
ldr r0, [r4]    @ D.13515, MEM[(struct _Vector_base *)this_1(D)]._M_impl._M_start
cbz r0, .L5 @ D.13515,
bl  _ZdlPv  @
.L5:
bl  __cxa_end_cleanup   @
# C++14, gcc 4.9
push    {r3, r4, r5, r6, r7, lr}    @
movs    r3, #0  @ tmp157,
mov r6, r0  @ this, this
str r3, [r0]    @ tmp157, MEM[(struct _Vector_impl *)this_1(D)]._M_start
mov r5, r2  @ length, length
str r3, [r0, #4]    @ tmp157, MEM[(struct _Vector_impl *)this_1(D)]._M_finish
str r3, [r0, #8]    @ tmp157, MEM[(struct _Vector_impl *)this_1(D)]._M_end_of_storage
cbnz    r2, .L25    @ length,
mov r0, r6  @, this
pop {r3, r4, r5, r6, r7, pc}    @
.L25:
mov r0, r2  @, length
mov r4, r1  @ pData, pData
bl  _Znwj   @
adds    r3, r4, r5  @ D.20345, pData, length
mov r7, r0  @ __result,
cmp r4, r3  @ pData, D.20345
ittt    ne
addne   r1, r4, #-1 @ ivtmp.76, pData,
movne   r3, r0  @ __result, __result
addne   r4, r0, r5  @ D.20346, __result, length
beq .L26    @,
.L7:
ldrb    r2, [r1, #1]!   @ zero_extendqisi2  @ D.20348, MEM[base: _48, offset: 0]
cbz r3, .L6 @ __result,
strb    r2, [r3]    @ D.20348, MEM[base: __result_23, offset: 0B]
.L6:
adds    r3, r3, #1  @ __result, __result,
cmp r3, r4  @ __result, D.20346
bne .L7 @,
.L8:
ldr r0, [r6]    @ D.20346, MEM[(struct vector *)this_1(D)].D.18218._M_impl._M_start
cbz r0, .L5 @ D.20346,
bl  _ZdlPv  @
.L5:
str r7, [r6]    @ __result, MEM[(struct vector *)this_1(D)].D.18218._M_impl._M_start
mov r0, r6  @, this
str r4, [r6, #4]    @ D.20346, MEM[(struct vector *)this_1(D)].D.18218._M_impl._M_finish
str r4, [r6, #8]    @ D.20346, MEM[(struct vector *)this_1(D)].D.18218._M_impl._M_end_of_storage
pop {r3, r4, r5, r6, r7, pc}    @
.L26:
adds    r4, r0, r5  @ D.20346, __result, length
b   .L8 @
.L11:
ldr r0, [r6]    @ D.20346, MEM[(struct _Vector_base *)this_1(D)]._M_impl._M_start
cbz r0, .L10    @ D.20346,
bl  _ZdlPv  @
.L10:
bl  __cxa_end_cleanup   @
# C++14, gcc 5.2
push    {r3, r4, r5, r6, r7, lr}    @
movs    r3, #0  @ tmp118,
mov r4, r0  @ this, this
str r3, [r0]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_start
str r3, [r0, #4]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_finish
str r3, [r0, #8]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_end_of_storage
cbnz    r2, .L19    @ length,
mov r0, r4  @, this
pop {r3, r4, r5, r6, r7, pc}    @
.L19:
mov r0, r2  @, length
mov r6, r1  @ pData, pData
mov r5, r2  @ length, length
bl  _Znwj   @
mov r2, r5  @, length
mov r1, r6  @, pData
mov r7, r0  @ D.20824,
bl  memcpy  @
ldr r0, [r4]    @ D.20823, MEM[(struct vector *)this_1(D)].D.18751._M_impl._M_start
cbz r0, .L3 @ D.20823,
bl  _ZdlPv  @
.L3:
add r5, r5, r7  @ D.20823, D.20824
str r7, [r4]    @ D.20824, MEM[(struct vector *)this_1(D)].D.18751._M_impl._M_start
str r5, [r4, #4]    @ D.20823, MEM[(struct vector *)this_1(D)].D.18751._M_impl._M_finish
mov r0, r4  @, this
str r5, [r4, #8]    @ D.20823, MEM[(struct vector *)this_1(D)].D.18751._M_impl._M_end_of_storage
pop {r3, r4, r5, r6, r7, pc}    @
.L6:
ldr r0, [r4]    @ D.20823, MEM[(struct _Vector_base *)this_1(D)]._M_impl._M_start
cbz r0, .L5 @ D.20823,
bl  _ZdlPv  @
.L5:
bl  __cxa_end_cleanup   @

This is a GCC bug present in the 4.9.2 release, see PR 64476. The difference between the default -std=gnu++03 mode and -std=c++14 mode is that in C++11 and later there can be trivial types that are not assignable (because their assignment operators may be deleted), which forces the std::uninitialized_copy implementation down a different (slower) code path. The check for assignability was wrong, which meant we took the slow path even when it was not needed.
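To illustrate the case the library has to guard against, here is a minimal example of my own (not from the original answer) of a type whose copy assignment is deleted; the static_asserts reflect how current compilers report its traits:

#include <type_traits>

// In C++11 a type can be trivially copyable yet not copy-assignable, so
// uninitialized_copy cannot treat "trivial" as "safe to bulk-copy via
// assignment"; it must also check assignability.
struct NotAssignable
{
    int value;
    NotAssignable& operator=(NotAssignable const&) = delete;
};

static_assert(!std::is_copy_assignable<NotAssignable>::value,
              "copy assignment is deleted");
static_assert(std::is_trivially_copyable<NotAssignable>::value,
              "but the type is still trivially copyable");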

I fixed it for GCC 4.9.3 two years ago, but your compiler is based on a snapshot taken between the 4.9.2 and 4.9.3 releases, from a few weeks before the fix went in.

You could ask Linaro to update their GCC 4.9 compiler to 4.9.4, or at least to apply the patch that fixes this bug.
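If updating the toolchain is not an option, one possibility is to keep the vector but sidestep the uninitialized_copy path. This is my sketch, not part of the original answer, and whether it actually dodges the slow path on a given snapshot is best confirmed by inspecting the generated assembly:

#include <stdint.h>
#include <stddef.h>
#include <cstring>
#include <vector>

// Size the vector first (resize may take a different, fill-based code
// path), then overwrite its contiguous storage with an explicit memcpy.
inline void assign_bytes(std::vector<uint8_t>& v,
                         uint8_t const* pData, size_t length)
{
    v.resize(length);
    if (length != 0)
        std::memcpy(v.data(), pData, length);
}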