前缀和后缀++的性能差异

用benchmark测试迭代器前缀和后缀++的性能

结论

经测试:

Release版本下并无差异
vector结果稳定
list测试波动大，由于内存不连续，影响因素多，但前缀和后缀写法区别不大

测试1(vector)

#include <benchmark/benchmark.h>
#include <vector>
#include <string>

// Benchmark element type: owns a std::string payload of 100 'x' characters.
// The copy operations are declared explicitly and copy the payload member-wise.
struct MyClass {
    std::string data;

    // Builds the 100-character payload.
    MyClass() : data(100, 'x') {}

    // Copy construction duplicates the payload string.
    MyClass(const MyClass&) = default;

    // Copy assignment duplicates the payload string and returns *this.
    MyClass& operator=(const MyClass&) = default;
};

// Benchmark: traverse a 1000-element std::vector<MyClass> using prefix ++it.
// The vector is built once, outside the benchmark state loop; every pass of
// the state loop performs one full traversal.
static void BM_IteratorPrefixIncrement(benchmark::State& state) {
    std::vector<MyClass> vec(1000);
    for (auto _ : state) {
        for (auto it = vec.begin(); it != vec.end(); ++it) {
            // Hand the dereferenced element to the benchmark library so the
            // traversal is not discarded by the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_IteratorPrefixIncrement);

// Benchmark: traverse a 1000-element std::vector<MyClass> using postfix it++.
// Identical to the prefix benchmark except for the increment form; the value
// returned by it++ is discarded.
static void BM_IteratorPostfixIncrement(benchmark::State& state) {
    std::vector<MyClass> vec(1000);
    for (auto _ : state) {
        for (auto it = vec.begin(); it != vec.end(); it++) {
            // Hand the dereferenced element to the benchmark library so the
            // traversal is not discarded by the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_IteratorPostfixIncrement);

// Google Benchmark entry point: runs the benchmarks registered above.
BENCHMARK_MAIN();

[main] 正在生成文件夹: d:/Vscode/code/test/build 
[build] 正在启动生成
[proc] 执行命令: D:\make_tools\cmake\bin\cmake.EXE --build d:/Vscode/code/test/build --config Release --target test_pow -j 18 --
[build] 适用于 .NET Framework MSBuild 版本 17.9.8+b34f75857
[build] 
[build]   test_pow.vcxproj -> D:\Vscode\code\test\build\Release\test_pow.exe
[driver] 生成完毕: 00:00:00.741
[build] 生成已完成，退出代码为 0

PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-03T23:54:07+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
----------------------------------------------------------------------
Benchmark                            Time             CPU   Iterations
----------------------------------------------------------------------
BM_IteratorPrefixIncrement        1450 ns         1444 ns       497778
BM_IteratorPostfixIncrement       1442 ns         1444 ns       497778
PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-03T23:54:11+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
----------------------------------------------------------------------
Benchmark                            Time             CPU   Iterations
----------------------------------------------------------------------
BM_IteratorPrefixIncrement        1442 ns         1444 ns       497778
BM_IteratorPostfixIncrement       1447 ns         1444 ns       497778
PS D:\Vscode\code\test\build\Release>

测试2(HeavyContainer)：

#include <benchmark/benchmark.h>
#include <vector>
#include <string>
#include <iostream>

// Stand-in for an expensive-to-copy type: owns a std::string payload of
// 100 'x' characters. Copy operations are declared explicitly and copy the
// payload member-wise.
struct Heavy {
    std::string data;

    // Builds the 100-character payload.
    Heavy() : data(100, 'x') {}

    // Copy construction duplicates the payload string.
    Heavy(const Heavy&) = default;

    // Copy assignment duplicates the payload string and returns *this.
    Heavy& operator=(const Heavy&) = default;
};

// Iterator for the custom container: a thin wrapper around a Heavy* that
// advertises itself as a forward iterator.
//
// Besides the operations the benchmarks use (*, prefix/postfix ++, !=), it
// provides the remaining pieces a forward iterator is expected to have:
// default construction, copy assignment, operator== and operator->.
class HeavyIterator {
public:
    using iterator_category = std::forward_iterator_tag;
    using value_type = Heavy;
    using difference_type = std::ptrdiff_t;
    using pointer = Heavy*;
    using reference = Heavy&;

    // Default-constructed iterator holds a null pointer; it may only be
    // assigned to, compared, or destroyed.
    HeavyIterator() : ptr_(nullptr) {}

    // Wraps a pointer to an element (or one past the last element).
    HeavyIterator(pointer ptr) : ptr_(ptr) {}

    // User-provided copy: copies the wrapped pointer.
    HeavyIterator(const HeavyIterator& other) : ptr_(other.ptr_) {}

    // Spelled out because the copy constructor is user-provided; relying on
    // the implicitly generated assignment in that situation is deprecated.
    HeavyIterator& operator=(const HeavyIterator& other) {
        ptr_ = other.ptr_;
        return *this;
    }

    // Access to the element the iterator points at.
    reference operator*() const { return *ptr_; }
    pointer operator->() const { return ptr_; }

    // Prefix ++: advances in place and returns this iterator.
    HeavyIterator& operator++() {
        ++ptr_;
        return *this;
    }

    // Postfix ++: advances, but returns a copy of the iterator as it was
    // before the increment (the conventional postfix contract).
    HeavyIterator operator++(int) {
        HeavyIterator tmp = *this;
        ++ptr_;
        return tmp;
    }

    // Two iterators are equal exactly when they wrap the same address.
    bool operator==(const HeavyIterator& other) const {
        return ptr_ == other.ptr_;
    }

    bool operator!=(const HeavyIterator& other) const {
        return ptr_ != other.ptr_;
    }

private:
    pointer ptr_;
};

// Custom container: keeps its Heavy elements in a std::vector and exposes
// them through HeavyIterator.
class HeavyContainer {
public:
    // Creates a container of n default-constructed Heavy elements.
    HeavyContainer(size_t n) : data_(n) {}

    // Iterator to the first stored element.
    HeavyIterator begin() {
        Heavy* const first = data_.data();
        return HeavyIterator(first);
    }

    // Iterator one past the last stored element.
    HeavyIterator end() {
        Heavy* const first = data_.data();
        return HeavyIterator(first + data_.size());
    }

private:
    std::vector<Heavy> data_;
};


// Benchmark: traverse a 1000-element HeavyContainer with prefix ++it, which
// calls HeavyIterator::operator++(). The container is built once, outside the
// benchmark state loop.
static void BM_CustomIteratorPrefix(benchmark::State& state) {
    HeavyContainer container(1000);
    for (auto _ : state) {
        for (auto it = container.begin(); it != container.end(); ++it) {
            // Keep the dereference visible to the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_CustomIteratorPrefix);

// Benchmark: traverse a 1000-element HeavyContainer with postfix it++, which
// calls HeavyIterator::operator++(int); the returned copy is discarded.
static void BM_CustomIteratorPostfix(benchmark::State& state) {
    HeavyContainer container(1000);
    for (auto _ : state) {
        for (auto it = container.begin(); it != container.end(); it++) {
            // Keep the dereference visible to the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_CustomIteratorPostfix);

// Google Benchmark entry point: runs the benchmarks registered above.
BENCHMARK_MAIN();

[main] 正在生成文件夹: d:/Vscode/code/test/build 
[build] 正在启动生成
[proc] 执行命令: D:\make_tools\cmake\bin\cmake.EXE --build d:/Vscode/code/test/build --config Release --target test_pow -j 18 --
[build] 适用于 .NET Framework MSBuild 版本 17.9.8+b34f75857
[build] 
[build]   test_pow.vcxproj -> D:\Vscode\code\test\build\Release\test_pow.exe
[driver] 生成完毕: 00:00:00.704
[build] 生成已完成，退出代码为 0

PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-03T23:55:35+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
-------------------------------------------------------------------
Benchmark                         Time             CPU   Iterations
-------------------------------------------------------------------
BM_CustomIteratorPrefix        1903 ns         1883 ns       373333
BM_CustomIteratorPostfix       1885 ns         1842 ns       373333
PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-03T23:56:03+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
-------------------------------------------------------------------
Benchmark                         Time             CPU   Iterations
-------------------------------------------------------------------
BM_CustomIteratorPrefix        1879 ns         1883 ns       373333
BM_CustomIteratorPostfix       1887 ns         1859 ns       344615
PS D:\Vscode\code\test\build\Release>

测试3(Vector、List、GenericVector)：

#include <benchmark/benchmark.h>
#include <vector>
#include <list>
#include <string>

// ----------------------------
// Heavy struct (simulates copy cost): owns a std::string payload of
// 100 'x' characters. Copy operations are declared explicitly and copy the
// payload member-wise.
struct Heavy {
    std::string data;

    // Builds the 100-character payload.
    Heavy() : data(100, 'x') {}

    // Copy construction duplicates the payload string.
    Heavy(const Heavy&) = default;

    // Copy assignment duplicates the payload string and returns *this.
    Heavy& operator=(const Heavy&) = default;
};

// ----------------------------
// Scenario 1: std::vector iterator comparison.
// Prefix variant: traverses a 1000-element std::vector<Heavy> with ++it.
// The vector is built once, outside the benchmark state loop.
static void BM_VectorPrefix(benchmark::State& state) {
    std::vector<Heavy> vec(1000);
    for (auto _ : state) {
        for (auto it = vec.begin(); it != vec.end(); ++it) {
            // Keep the dereference visible to the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_VectorPrefix);

// Scenario 1, postfix variant: traverses a 1000-element std::vector<Heavy>
// with it++; the value returned by it++ is discarded.
static void BM_VectorPostfix(benchmark::State& state) {
    std::vector<Heavy> vec(1000);
    for (auto _ : state) {
        for (auto it = vec.begin(); it != vec.end(); it++) {
            // Keep the dereference visible to the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_VectorPostfix);

// ----------------------------
// Scenario 2: std::list iterator comparison (more realistic construction cost).
// Prefix variant: traverses a 1000-node std::list<Heavy> with ++it.
// The list is built once, outside the benchmark state loop.
static void BM_ListPrefix(benchmark::State& state) {
    std::list<Heavy> lst(1000);
    for (auto _ : state) {
        for (auto it = lst.begin(); it != lst.end(); ++it) {
            // Keep the dereference visible to the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_ListPrefix);

// Scenario 2, postfix variant: traverses a 1000-node std::list<Heavy> with
// it++; the value returned by it++ is discarded.
static void BM_ListPostfix(benchmark::State& state) {
    std::list<Heavy> lst(1000);
    for (auto _ : state) {
        for (auto it = lst.begin(); it != lst.end(); it++) {
            // Keep the dereference visible to the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_ListPostfix);

// ----------------------------
// Scenario 3: actually use the value returned by postfix ++ (to block
// optimization). The element is read through the iterator copy returned by
// it++ rather than through `it` itself.
static void BM_UseReturnOfPostfix(benchmark::State& state) {
    std::vector<Heavy> vec(1000);
    for (auto _ : state) {
        for (auto it = vec.begin(); it != vec.end(); ) {
            auto prev = it++;  // force the temporary copy to be kept
            benchmark::DoNotOptimize(*prev);
        }
    }
}
BENCHMARK(BM_UseReturnOfPostfix);

// ----------------------------
// Scenario 4: function templates, simulating calls from generic code.
// Runs the benchmark state loop over [begin, end) using prefix ++it.
// `begin` and `end` are taken by value and reused for every pass.
template<typename Iterator>
static void IteratePrefix(benchmark::State& state, Iterator begin, Iterator end) {
    for (auto _ : state) {
        for (auto it = begin; it != end; ++it)
            benchmark::DoNotOptimize(*it);
    }
}

// Postfix counterpart of IteratePrefix: runs the benchmark state loop over
// [begin, end) using it++, discarding the value it++ returns.
template<typename Iterator>
static void IteratePostfix(benchmark::State& state, Iterator begin, Iterator end) {
    for (auto _ : state) {
        for (auto it = begin; it != end; it++)
            benchmark::DoNotOptimize(*it);
    }
}

// Benchmark: prefix traversal of a 1000-element std::vector<Heavy> through
// the generic IteratePrefix helper.
static void BM_GenericVectorPrefix(benchmark::State& state) {
    std::vector<Heavy> vec(1000);
    IteratePrefix(state, vec.begin(), vec.end());
}
BENCHMARK(BM_GenericVectorPrefix);

// Benchmark: postfix traversal of a 1000-element std::vector<Heavy> through
// the generic IteratePostfix helper.
static void BM_GenericVectorPostfix(benchmark::State& state) {
    std::vector<Heavy> vec(1000);
    IteratePostfix(state, vec.begin(), vec.end());
}
BENCHMARK(BM_GenericVectorPostfix);

// ----------------------------
// Google Benchmark entry point: runs the benchmarks registered above.
BENCHMARK_MAIN();

[main] 正在生成文件夹: d:/Vscode/code/test/build 
[build] 正在启动生成
[proc] 执行命令: D:\make_tools\cmake\bin\cmake.EXE --build d:/Vscode/code/test/build --config Release --target test_pow -j 18 --
[build] 适用于 .NET Framework MSBuild 版本 17.9.8+b34f75857
[build] 
[build]   test_pow.vcxproj -> D:\Vscode\code\test\build\Release\test_pow.exe
[driver] 生成完毕: 00:00:00.708
[build] 生成已完成，退出代码为 0

PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-04T00:01:29+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
------------------------------------------------------------------
Benchmark                        Time             CPU   Iterations
------------------------------------------------------------------
BM_VectorPrefix               1690 ns         1688 ns       407273
BM_VectorPostfix              1678 ns         1674 ns       448000
BM_ListPrefix                 2047 ns         2051 ns       320000
BM_ListPostfix                2001 ns         2002 ns       320000
BM_UseReturnOfPostfix         1442 ns         1444 ns       497778
BM_GenericVectorPrefix        1484 ns         1475 ns       497778
BM_GenericVectorPostfix       1496 ns         1507 ns       497778
PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-04T00:01:36+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
------------------------------------------------------------------
Benchmark                        Time             CPU   Iterations
------------------------------------------------------------------
BM_VectorPrefix               1689 ns         1688 ns       407273
BM_VectorPostfix              1685 ns         1688 ns       407273
BM_ListPrefix                 2001 ns         2002 ns       320000
BM_ListPostfix                2107 ns         2086 ns       344615
BM_UseReturnOfPostfix         1435 ns         1413 ns       497778
BM_GenericVectorPrefix        1483 ns         1507 ns       497778
BM_GenericVectorPostfix       1497 ns         1507 ns       497778
PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-04T00:01:43+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
------------------------------------------------------------------
Benchmark                        Time             CPU   Iterations
------------------------------------------------------------------
BM_VectorPrefix               1698 ns         1709 ns       448000
BM_VectorPostfix              1684 ns         1688 ns       407273
BM_ListPrefix                 2046 ns         2040 ns       344615
BM_ListPostfix                1989 ns         1995 ns       344615
BM_UseReturnOfPostfix         1443 ns         1430 ns       448000
BM_GenericVectorPrefix        1472 ns         1465 ns       448000
BM_GenericVectorPostfix       1475 ns         1465 ns       448000
PS D:\Vscode\code\test\build\Release>

测试4（List）:

#include <benchmark/benchmark.h>
#include <list>
#include <string>

// Benchmark element type: owns a std::string payload of 100 'x' characters.
// Copy operations are declared explicitly and copy the payload member-wise.
struct Heavy {
    std::string data;

    // Builds the 100-character payload.
    Heavy() : data(100, 'x') {}

    // Copy construction duplicates the payload string.
    Heavy(const Heavy&) = default;

    // Copy assignment duplicates the payload string and returns *this.
    Heavy& operator=(const Heavy&) = default;
};

// Benchmark: prefix traversal of a 1000-node std::list<Heavy>. The list is
// built once, outside the benchmark state loop. Registered with 10
// repetitions, reporting only the aggregate rows.
static void BM_ListPrefix(benchmark::State& state) {
    std::list<Heavy> lst(1000);
    for (auto _ : state) {
        for (auto it = lst.begin(); it != lst.end(); ++it) {
            // Keep the dereference visible to the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_ListPrefix)->Repetitions(10)->ReportAggregatesOnly(true);

// Benchmark: postfix traversal of a 1000-node std::list<Heavy>; the value
// returned by it++ is discarded. Registered with 10 repetitions, reporting
// only the aggregate rows.
static void BM_ListPostfix(benchmark::State& state) {
    std::list<Heavy> lst(1000);
    for (auto _ : state) {
        for (auto it = lst.begin(); it != lst.end(); it++) {
            // Keep the dereference visible to the optimizer.
            benchmark::DoNotOptimize(*it);
        }
    }
}
BENCHMARK(BM_ListPostfix)->Repetitions(10)->ReportAggregatesOnly(true);

// Google Benchmark entry point: runs the benchmarks registered above.
BENCHMARK_MAIN();

PS D:\Vscode\code\test\build\Release> ./test_pow.exe --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
2025-07-04T00:07:06+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
---------------------------------------------------------------------------
Benchmark                                 Time             CPU   Iterations
---------------------------------------------------------------------------
BM_ListPrefix/repeats:10_mean          2107 ns         2104 ns           10
BM_ListPrefix/repeats:10_median        2122 ns         2131 ns           10
BM_ListPrefix/repeats:10_stddev        71.9 ns         86.0 ns           10
BM_ListPostfix/repeats:10_mean         2065 ns         2058 ns           10
BM_ListPostfix/repeats:10_median       2067 ns         2063 ns           10
BM_ListPostfix/repeats:10_stddev       67.5 ns         74.7 ns           10
PS D:\Vscode\code\test\build\Release> ./test_pow.exe --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
2025-07-04T00:07:23+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
---------------------------------------------------------------------------
Benchmark                                 Time             CPU   Iterations
---------------------------------------------------------------------------
BM_ListPrefix/repeats:10_mean          2095 ns         2095 ns           10
BM_ListPrefix/repeats:10_median        2110 ns         2100 ns           10
BM_ListPrefix/repeats:10_stddev         112 ns          109 ns           10
BM_ListPostfix/repeats:10_mean         2138 ns         2131 ns           10
BM_ListPostfix/repeats:10_median       2120 ns         2131 ns           10
BM_ListPostfix/repeats:10_stddev        119 ns          115 ns           10
PS D:\Vscode\code\test\build\Release> ./test_pow.exe --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
2025-07-04T00:07:40+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
---------------------------------------------------------------------------
Benchmark                                 Time             CPU   Iterations
---------------------------------------------------------------------------
BM_ListPrefix/repeats:10_mean          2041 ns         2036 ns           10
BM_ListPrefix/repeats:10_median        2039 ns         2040 ns           10
BM_ListPrefix/repeats:10_stddev        60.7 ns         65.7 ns           10
BM_ListPostfix/repeats:10_mean         2103 ns         2104 ns           10
BM_ListPostfix/repeats:10_median       2082 ns         2086 ns           10
BM_ListPostfix/repeats:10_stddev       95.2 ns         91.2 ns           10
PS D:\Vscode\code\test\build\Release> ./test_pow.exe --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
2025-07-04T00:08:05+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
---------------------------------------------------------------------------
Benchmark                                 Time             CPU   Iterations
---------------------------------------------------------------------------
BM_ListPrefix/repeats:10_mean          2029 ns         2031 ns           10
BM_ListPrefix/repeats:10_median        2006 ns         2018 ns           10
BM_ListPrefix/repeats:10_stddev         100 ns         99.8 ns           10
BM_ListPostfix/repeats:10_mean         2071 ns         2072 ns           10
BM_ListPostfix/repeats:10_median       2060 ns         2063 ns           10
BM_ListPostfix/repeats:10_stddev       59.6 ns         52.6 ns           10
PS D:\Vscode\code\test\build\Release>

测试5（List）:

#include <benchmark/benchmark.h>
#include <list>
#include <string>
#include <atomic>
#include <iostream>

// Payload type that tallies how many times it has been copy-constructed.
struct Heavy {
    std::string data;
    // Incremented by the copy constructor only; callers reset it themselves.
    static std::atomic<int> copy_count;

    // Builds a payload of 100 'x' characters.
    Heavy() : data(100, 'x') {}

    // Duplicates the payload and records the copy in copy_count.
    Heavy(const Heavy& rhs) : data(rhs.data) {
        copy_count.fetch_add(1);
    }

    // Copy assignment duplicates the payload; it does not touch copy_count.
    Heavy& operator=(const Heavy&) = default;
};

std::atomic<int> Heavy::copy_count(0);

// Benchmark: prefix traversal of a 1000-node std::list<Heavy>, reporting
// Heavy::copy_count as the "CopyCount" user counter.
//
// NOTE(review): the list is constructed (and destroyed) inside the benchmark
// state loop, so each measured pass includes building 1000 nodes, not just
// the traversal.
// NOTE(review): copy_count is incremented only by Heavy's copy constructor.
// Nothing in this loop copies a Heavy (elements are reached by reference
// through *it), so the counter does not observe iterator copies.
static void BM_ListPrefix(benchmark::State& state) {
    for (auto _ : state) {
        Heavy::copy_count = 0;
        std::list<Heavy> lst(1000);
        for (auto it = lst.begin(); it != lst.end(); ++it) {
            benchmark::DoNotOptimize(*it);
        }
        benchmark::ClobberMemory();
        // Store the result in state so it shows up in the report.
        state.counters["CopyCount"] = Heavy::copy_count.load();
    }
}
BENCHMARK(BM_ListPrefix)->Repetitions(5)->ReportAggregatesOnly(true);

// Benchmark: postfix traversal of a 1000-node std::list<Heavy>, reporting
// Heavy::copy_count as the "CopyCount" user counter.
//
// NOTE(review): the list is constructed (and destroyed) inside the benchmark
// state loop, so each measured pass includes building 1000 nodes, not just
// the traversal.
// NOTE(review): copy_count is incremented only by Heavy's copy constructor.
// it++ copies the iterator, not the Heavy element, so the counter does not
// observe the temporary that postfix increment creates.
static void BM_ListPostfix(benchmark::State& state) {
    for (auto _ : state) {
        Heavy::copy_count = 0;
        std::list<Heavy> lst(1000);
        for (auto it = lst.begin(); it != lst.end(); it++) {
            benchmark::DoNotOptimize(*it);
        }
        benchmark::ClobberMemory();
        // Store the result in state so it shows up in the report.
        state.counters["CopyCount"] = Heavy::copy_count.load();
    }
}
BENCHMARK(BM_ListPostfix)->Repetitions(5)->ReportAggregatesOnly(true);

// Google Benchmark entry point: runs the benchmarks registered above.
BENCHMARK_MAIN();

PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-04T00:09:58+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
------------------------------------------------------------------------------------------
Benchmark                                Time             CPU   Iterations UserCounters...
------------------------------------------------------------------------------------------
BM_ListPrefix/repeats:5_mean         74997 ns        74912 ns            5 CopyCount=0
BM_ListPrefix/repeats:5_median       74554 ns        74557 ns            5 CopyCount=0
BM_ListPrefix/repeats:5_stddev        1267 ns         1485 ns            5 CopyCount=0
BM_ListPostfix/repeats:5_mean        73618 ns        73382 ns            5 CopyCount=0
BM_ListPostfix/repeats:5_median      73442 ns        72545 ns            5 CopyCount=0
BM_ListPostfix/repeats:5_stddev        419 ns         1248 ns            5 CopyCount=0
PS D:\Vscode\code\test\build\Release> ."D:/Vscode/code/test/build/Release/test_pow.exe"
2025-07-04T00:10:17+08:00
Running D:\Vscode\code\test\build\Release\test_pow.exe
Run on (16 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 512 KiB (x8)
  L3 Unified 16384 KiB (x1)
------------------------------------------------------------------------------------------
Benchmark                                Time             CPU   Iterations UserCounters...
------------------------------------------------------------------------------------------
BM_ListPrefix/repeats:5_mean         72748 ns        72782 ns            5 CopyCount=0
BM_ListPrefix/repeats:5_median       72344 ns        72782 ns            5 CopyCount=0
BM_ListPrefix/repeats:5_stddev         828 ns         1255 ns            5 CopyCount=0
BM_ListPostfix/repeats:5_mean        73727 ns        73661 ns            5 CopyCount=0
BM_ListPostfix/repeats:5_median      73452 ns        73940 ns            5 CopyCount=0
BM_ListPostfix/repeats:5_stddev       1375 ns         1167 ns            5 CopyCount=0
PS D:\Vscode\code\test\build\Release>

测试5的汇编分析：

Compiler Explorer

采用的编译选项：

`-std=c++17 -mtune=core2 -march=nocona -O3`

#include <list>
#include <string>
#include <atomic>
#include <iostream>

// Payload type that tallies how many times it has been copy-constructed.
struct Heavy {
    std::string data;
    // Incremented by the copy constructor only; callers reset it themselves.
    static std::atomic<int> copy_count;

    // Builds a payload of 100 'x' characters.
    Heavy() : data(100, 'x') {}

    // Duplicates the payload and records the copy in copy_count.
    Heavy(const Heavy& rhs) : data(rhs.data) {
        copy_count.fetch_add(1);
    }

    // Copy assignment duplicates the payload; it does not touch copy_count.
    Heavy& operator=(const Heavy&) = default;
};

std::atomic<int> Heavy::copy_count(0);

// Resets Heavy::copy_count, builds a 1000-node std::list<Heavy>, walks it
// with prefix ++it, then prints the copy count.
//
// NOTE(review): in the -O3 listing recorded with this snippet, no traversal
// loop appears between the list construction and the print, so the volatile
// reference apparently does not keep the loop alive. Confirm before drawing
// conclusions about ++it from that assembly.
void test_prefix() {
    Heavy::copy_count = 0;
    std::list<Heavy> lst(1000);
    for (auto it = lst.begin(); it != lst.end(); ++it) {
        volatile Heavy const& h = *it; // intended to prevent optimization
        (void)h;
    }
    std::cout << "Prefix increment copy count: " << Heavy::copy_count << "\n";
}

// Resets Heavy::copy_count, builds a 1000-node std::list<Heavy>, walks it
// with postfix it++ (return value discarded), then prints the copy count.
//
// NOTE(review): in the -O3 listing recorded with this snippet, no traversal
// loop appears between the list construction and the print, so the volatile
// reference apparently does not keep the loop alive. Confirm before drawing
// conclusions about it++ from that assembly.
void test_postfix() {
    Heavy::copy_count = 0;
    std::list<Heavy> lst(1000);
    for (auto it = lst.begin(); it != lst.end(); it++) {
        volatile Heavy const& h = *it; // intended to prevent optimization
        (void)h;
    }
    std::cout << "Postfix increment copy count: " << Heavy::copy_count << "\n";
}

// Entry point: runs the prefix variant, then the postfix variant.
int main() {
    test_prefix();
    test_postfix();
    return 0;
}

std::__cxx11::_List_base<Heavy, std::allocator<Heavy>>::_M_clear():
        push    r12
        push    rbp
        push    rbx
        mov     rbp, QWORD PTR [rdi]
        cmp     rbp, rdi
        je      .L1
        mov     r12, rdi
.L4:
        mov     rbx, rbp
        mov     rbp, QWORD PTR [rbp+0]
        mov     rdi, QWORD PTR [rbx+16]
        lea     rax, [rbx+32]
        cmp     rdi, rax
        je      .L3
        mov     rax, QWORD PTR [rbx+32]
        lea     rsi, [rax+1]
        call    operator delete(void*, unsigned long)
.L3:
        mov     esi, 48
        mov     rdi, rbx
        call    operator delete(void*, unsigned long)
        cmp     rbp, r12
        jne     .L4
.L1:
        pop     rbx
        pop     rbp
        pop     r12
        ret
.LC0:
        .string "Prefix increment copy count: "
.LC1:
        .string "\n"
test_prefix():
        push    r12
        xor     eax, eax
        push    rbp
        push    rbx
        sub     rsp, 32
        xchg    eax, DWORD PTR Heavy::copy_count[rip]
        mov     ebp, 1000
        mov     QWORD PTR [rsp+16], 0
        movq    xmm1, rsp
        movddup xmm0, xmm1
        movaps  XMMWORD PTR [rsp], xmm0
.L9:
        mov     edi, 48
        call    operator new(unsigned long)
        mov     rbx, rax
        mov     edx, 120
        mov     esi, 100
        lea     rdi, [rax+16]
        lea     rax, [rax+32]
        mov     QWORD PTR [rbx+16], rax
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>::_M_construct(unsigned long, char)
        mov     rsi, rsp
        mov     rdi, rbx
        call    std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)
        add     QWORD PTR [rsp+16], 1
        sub     rbp, 1
        jne     .L9
        mov     edx, 29
        mov     esi, OFFSET FLAT:.LC0
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)
        mov     edi, OFFSET FLAT:std::cout
        mov     esi, DWORD PTR Heavy::copy_count[rip]
        call    std::ostream::operator<<(int)
        mov     edx, 1
        mov     esi, OFFSET FLAT:.LC1
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)
        mov     rbp, QWORD PTR [rsp]
        cmp     rbp, rsp
        je      .L8
.L15:
        mov     rbx, rbp
        mov     rbp, QWORD PTR [rbp+0]
        mov     rdi, QWORD PTR [rbx+16]
        lea     rax, [rbx+32]
        cmp     rdi, rax
        je      .L14
        mov     rax, QWORD PTR [rbx+32]
        lea     rsi, [rax+1]
        call    operator delete(void*, unsigned long)
.L14:
        mov     esi, 48
        mov     rdi, rbx
        call    operator delete(void*, unsigned long)
        cmp     rbp, rsp
        jne     .L15
.L8:
        add     rsp, 32
        pop     rbx
        pop     rbp
        pop     r12
        ret
        mov     rbp, rax
        jmp     .L12
        mov     rbx, rax
        jmp     .L16
        mov     rbp, rax
        jmp     .L11
test_prefix() (.cold):
.LC3:
        .string "Postfix increment copy count: "
test_postfix():
        push    r12
        xor     eax, eax
        push    rbp
        push    rbx
        sub     rsp, 32
        xchg    eax, DWORD PTR Heavy::copy_count[rip]
        mov     ebp, 1000
        mov     QWORD PTR [rsp+16], 0
        movq    xmm1, rsp
        movddup xmm0, xmm1
        movaps  XMMWORD PTR [rsp], xmm0
.L25:
        mov     edi, 48
        call    operator new(unsigned long)
        mov     rbx, rax
        mov     edx, 120
        mov     esi, 100
        lea     rdi, [rax+16]
        lea     rax, [rax+32]
        mov     QWORD PTR [rbx+16], rax
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>::_M_construct(unsigned long, char)
        mov     rsi, rsp
        mov     rdi, rbx
        call    std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)
        add     QWORD PTR [rsp+16], 1
        sub     rbp, 1
        jne     .L25
        mov     edx, 30
        mov     esi, OFFSET FLAT:.LC3
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)
        mov     edi, OFFSET FLAT:std::cout
        mov     esi, DWORD PTR Heavy::copy_count[rip]
        call    std::ostream::operator<<(int)
        mov     edx, 1
        mov     esi, OFFSET FLAT:.LC1
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)
        mov     rbp, QWORD PTR [rsp]
        cmp     rbp, rsp
        je      .L24
.L31:
        mov     rbx, rbp
        mov     rbp, QWORD PTR [rbp+0]
        mov     rdi, QWORD PTR [rbx+16]
        lea     rax, [rbx+32]
        cmp     rdi, rax
        je      .L30
        mov     rax, QWORD PTR [rbx+32]
        lea     rsi, [rax+1]
        call    operator delete(void*, unsigned long)
.L30:
        mov     esi, 48
        mov     rdi, rbx
        call    operator delete(void*, unsigned long)
        cmp     rbp, rsp
        jne     .L31
.L24:
        add     rsp, 32
        pop     rbx
        pop     rbp
        pop     r12
        ret
        mov     rbp, rax
        jmp     .L28
        mov     rbx, rax
        jmp     .L32
        mov     rbp, rax
        jmp     .L27
test_postfix() (.cold):
main:
        sub     rsp, 8
        call    test_prefix()
        call    test_postfix()
        xor     eax, eax
        add     rsp, 8
        ret
Heavy::copy_count:
        .zero   4

AI分析：

Claude

让我详细对比下两个函数的汇编代码结构：

函数入口部分完全相同：

push    r12
xor     eax, eax
push    rbp
push    rbx
sub     rsp, 32
xchg    eax, DWORD PTR Heavy::copy_count[rip]
mov     ebp, 1000
mov     QWORD PTR [rsp+16], 0
movq    xmm1, rsp
movddup xmm0, xmm1
movaps  XMMWORD PTR [rsp], xmm0

主循环部分完全相同：

mov     edi, 48
call    operator new(unsigned long)
mov     rbx, rax
mov     edx, 120
mov     esi, 100
lea     rdi, [rax+16]
lea     rax, [rax+32]
mov     QWORD PTR [rbx+16], rax
call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>::_M_construct(unsigned long, char)
mov     rsi, rsp
mov     rdi, rbx
call    std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)
add     QWORD PTR [rsp+16], 1
sub     rbp, 1
jne     .L9/.L25

输出部分除了字符串长度和内容外，结构完全相同：

// prefix用29，postfix用30
        mov     edx, 29/30  
        mov     esi, OFFSET FLAT:.LC0/.LC3
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)

清理部分完全相同：

        mov     rbp, QWORD PTR [rsp]
        cmp     rbp, rsp
        je      .L8/.L24
...
        add     rsp, 32
        pop     rbx
        pop     rbp
        pop     r12
        ret

错误处理部分完全相同：

mov     rbp, rax
jmp     .L12/.L28
mov     rbx, rax
jmp     .L16/.L32
mov     rbp, rax
jmp     .L11/.L27

通过仔细分析，可以确认除了以下差异外：
1. 标签命名（如 .L9 vs .L25）
2. 输出字符串的内容（"Prefix increment copy count: " vs "Postfix increment copy count: "）
3. 输出字符串的长度（29 vs 30）

这两个函数的汇编代码是完全相同的。这进一步证实了在现代编译器优化下，前缀++和后缀++在这种场景下确实没有任何性能差异。编译器生成了完全相同的机器代码来处理这两种操作。

Qwen

从你提供的汇编代码中，test_prefix() 和 test_postfix() 的汇编逻辑几乎完全一致，除了以下几点：

1. 汇编差异点

(1) 函数名和输出字符串
• 函数名：test_prefix() vs test_postfix()（汇编中的标签不同）。
• 输出字符串：
◦ test_prefix() 输出 .LC0: "Prefix increment copy count: "
◦ test_postfix() 输出 .LC3: "Postfix increment copy count: "
(2) .cold 段的调用
• test_prefix() 和 test_postfix() 都有 .cold 段的调用，但地址不同（.L8 vs .L24），这是编译器将冷代码（如析构、异常处理）分离到 .cold 段的优化策略，与 ++i/i++ 无关。

2. 核心逻辑的汇编完全一致

关键循环体对比：
; test_prefix() 的循环体

.L9:
    mov     edi, 48
    call    operator new(unsigned long)
    mov     rbx, rax
    mov     edx, 120
    mov     esi, 100
    lea     rdi, [rax+16]
    lea     rax, [rax+32]
    mov     QWORD PTR [rbx+16], rax
    call    std::__cxx11::basic_string::_M_construct
    mov     rsi, rsp
    mov     rdi, rbx
    call    std::__detail::_List_node_base::_M_hook
    add     QWORD PTR [rsp+16], 1
    sub     rbp, 1
    jne     .L9

; test_postfix() 的循环体
.L25:
    mov     edi, 48
    call    operator new(unsigned long)
    mov     rbx, rax
    mov     edx, 120
    mov     esi, 100
    lea     rdi, [rax+16]
    lea     rax, [rax+32]
    mov     QWORD PTR [rbx+16], rax
    call    std::__cxx11::basic_string::_M_construct
    mov     rsi, rsp
    mov     rdi, rbx
    call    std::__detail::_List_node_base::_M_hook
    add     QWORD PTR [rsp+16], 1
    sub     rbp, 1
    jne     .L25

结论：
• 指令序列完全一致：从 operator new 到 _M_hook 的调用，再到循环计数器的更新（sub rbp, 1），两者的汇编代码完全相同。
• 无额外的临时对象处理：后缀 i++ 通常会创建临时对象，但汇编中未看到任何额外的 mov、call 或 lea 指令（如拷贝构造或临时对象销毁）。

3. 为什么汇编中没有差异？

编译器优化（-O3）

• 内联优化：编译器将 std::list::iterator 的后缀 i++ 内联展开，消除了临时对象的开销。
• 死代码消除：Heavy::copy_count 的统计可能被优化掉，因为其结果未被实际使用（仅输出到控制台）。

std::list 迭代器的特殊性

• std::list 的迭代器通常是双向指针结构（如 std::__detail::_List_node_base*），其后缀 i++ 只需复制指针值（rax），开销极低。

• 示例：

mov rax, QWORD PTR [rbx+16] ; 获取当前节点的 next 指针

4. 是否存在隐藏的性能差异？

如果禁用优化（-O0）

• 后缀 i++ 会显式生成临时对象，汇编中会看到：
mov rsi, rax ; 复制当前迭代器
call Heavy::Heavy(Heavy const&) ; 拷贝构造临时对象
• 性能差异显著：临时对象的创建和销毁会引入额外开销。

如果 Heavy 类未被优化

• 如果 Heavy 的拷贝构造未被内联或优化，后缀 i++ 会触发额外的 call 指令（如 Heavy::Heavy(Heavy const&)）。

测试6（vector的汇编）：

#include <vector>
#include <string>
#include <atomic>
#include <iostream>

/// Element type for the increment experiment: owns a string payload and
/// counts how many times an instance has been copy-constructed.
struct MyClass {
    /// Running total of copy constructions, shared by every instance.
    static std::atomic<int> copy_count;

    std::string data;

    /// Fills `data` with 100 'x' characters.
    MyClass() : data(100, 'x') {}

    /// Copies `data` from `src` and records the copy in `copy_count`.
    MyClass(const MyClass& src) : data(src.data) {
        copy_count.fetch_add(1);
    }

    /// Copy assignment: replaces `data`; `copy_count` is not modified.
    MyClass& operator=(const MyClass& src) {
        this->data = src.data;
        return *this;
    }
};

// Out-of-class definition of the shared counter, starting at zero.
std::atomic<int> MyClass::copy_count{0};

// 使用前缀 ++it
void test_prefix() {
    MyClass::copy_count = 0;
    std::vector<MyClass> vec(1000);
    for (auto it = vec.begin(); it != vec.end(); ++it) {
        volatile const MyClass& ref = *it;  // 防止优化
        (void)ref;
    }
    std::cout << "Prefix increment copy count: " << MyClass::copy_count << "\n";
}

// 使用后缀 it++
void test_postfix() {
    MyClass::copy_count = 0;
    std::vector<MyClass> vec(1000);
    for (auto it = vec.begin(); it != vec.end(); it++) {
        volatile const MyClass& ref = *it;  // 防止优化
        (void)ref;
    }
    std::cout << "Postfix increment copy count: " << MyClass::copy_count << "\n";
}

/// Runs the prefix experiment first, then the postfix one.
int main() {
    test_prefix();
    test_postfix();
    // Falling off the end of main is equivalent to `return 0;`.
}

编译选项：`-std=c++17 -mtune=core2 -march=nocona -O3`

std::vector<MyClass, std::allocator<MyClass>>::~vector() [base object destructor]:
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        mov     rbp, QWORD PTR [rdi+8]
        mov     rbx, QWORD PTR [rdi]
        cmp     rbp, rbx
        je      .L2
.L4:
        mov     rdi, QWORD PTR [rbx]
        lea     rax, [rbx+16]
        cmp     rdi, rax
        je      .L3
        mov     rax, QWORD PTR [rbx+16]
        lea     rsi, [rax+1]
        call    operator delete(void*, unsigned long)
.L3:
        add     rbx, 32
        cmp     rbp, rbx
        jne     .L4
        mov     rbx, QWORD PTR [r12]
.L2:
        test    rbx, rbx
        je      .L1
        mov     rsi, QWORD PTR [r12+16]
        mov     rdi, rbx
        sub     rsi, rbx
        pop     rbx
        pop     rbp
        pop     r12
        jmp     operator delete(void*, unsigned long)
.L1:
        pop     rbx
        pop     rbp
        pop     r12
        ret
.LC0:
        .string "Prefix increment copy count: "
.LC1:
        .string "\n"
test_prefix():
        push    r14
        xor     eax, eax
        push    r12
        push    rbp
        push    rbx
        sub     rsp, 40
        xchg    eax, DWORD PTR MyClass::copy_count[rip]
        mov     edi, 32000
        call    operator new(unsigned long)
        lea     rbp, [rax+32000]
        mov     r12, rax
        mov     rbx, rax
.L13:
        lea     rax, [rbx+16]
        mov     edx, 120
        mov     esi, 100
        mov     QWORD PTR [rbx], rax
        mov     rdi, rbx
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>::_M_construct(unsigned long, char)
        add     rbx, 32
        cmp     rbx, rbp
        jne     .L13
        mov     edx, 29
        mov     esi, OFFSET FLAT:.LC0
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)
        mov     edi, OFFSET FLAT:std::cout
        mov     esi, DWORD PTR MyClass::copy_count[rip]
        call    std::ostream::operator<<(int)
        mov     edx, 1
        mov     esi, OFFSET FLAT:.LC1
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)
        cmp     rbx, r12
        je      .L16
        mov     rbp, r12
.L18:
        mov     rdi, QWORD PTR [rbp+0]
        lea     rax, [rbp+16]
        cmp     rdi, rax
        je      .L17
        mov     rax, QWORD PTR [rbp+16]
        lea     rsi, [rax+1]
        call    operator delete(void*, unsigned long)
.L17:
        add     rbp, 32
        cmp     rbx, rbp
        jne     .L18
.L16:
        add     rsp, 40
        mov     rdi, r12
        mov     esi, 32000
        pop     rbx
        pop     rbp
        pop     r12
        pop     r14
        jmp     operator delete(void*, unsigned long)
        mov     r14, rax
        jmp     .L11
        mov     r14, rax
        jmp     .L19
test_prefix() (.cold):
.L11:
        mov     rbp, r12
.L12:
        cmp     rbp, rbx
        je      .L27
        mov     rdi, rbp
        add     rbp, 32
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>::_M_dispose()
        jmp     .L12
.L19:
        movq    xmm1, rbx
        mov     rdi, rsp
        movq    xmm0, r12
        mov     QWORD PTR [rsp+16], rbp
        punpcklqdq      xmm0, xmm1
        movaps  XMMWORD PTR [rsp], xmm0
        call    std::vector<MyClass, std::allocator<MyClass>>::~vector() [complete object destructor]
        mov     rdi, r14
        call    _Unwind_Resume
.L27:
        mov     rdi, r12
        mov     esi, 32000
        call    operator delete(void*, unsigned long)
        mov     rdi, r14
        call    _Unwind_Resume
.LC3:
        .string "Postfix increment copy count: "
test_postfix():
        push    r14
        xor     eax, eax
        push    r12
        push    rbp
        push    rbx
        sub     rsp, 40
        xchg    eax, DWORD PTR MyClass::copy_count[rip]
        mov     edi, 32000
        call    operator new(unsigned long)
        lea     rbp, [rax+32000]
        mov     r12, rax
        mov     rbx, rax
.L32:
        lea     rax, [rbx+16]
        mov     edx, 120
        mov     esi, 100
        mov     QWORD PTR [rbx], rax
        mov     rdi, rbx
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>::_M_construct(unsigned long, char)
        add     rbx, 32
        cmp     rbx, rbp
        jne     .L32
        mov     edx, 30
        mov     esi, OFFSET FLAT:.LC3
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)
        mov     edi, OFFSET FLAT:std::cout
        mov     esi, DWORD PTR MyClass::copy_count[rip]
        call    std::ostream::operator<<(int)
        mov     edx, 1
        mov     esi, OFFSET FLAT:.LC1
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char>>& std::__ostream_insert<char, std::char_traits<char>>(std::basic_ostream<char, std::char_traits<char>>&, char const*, long)
        cmp     rbx, r12
        je      .L35
        mov     rbp, r12
.L37:
        mov     rdi, QWORD PTR [rbp+0]
        lea     rax, [rbp+16]
        cmp     rdi, rax
        je      .L36
        mov     rax, QWORD PTR [rbp+16]
        lea     rsi, [rax+1]
        call    operator delete(void*, unsigned long)
.L36:
        add     rbp, 32
        cmp     rbx, rbp
        jne     .L37
.L35:
        add     rsp, 40
        mov     rdi, r12
        mov     esi, 32000
        pop     rbx
        pop     rbp
        pop     r12
        pop     r14
        jmp     operator delete(void*, unsigned long)
        mov     r14, rax
        jmp     .L30
        mov     r14, rax
        jmp     .L38
test_postfix() (.cold):
.L30:
        mov     rbp, r12
.L31:
        cmp     rbp, rbx
        je      .L46
        mov     rdi, rbp
        add     rbp, 32
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>::_M_dispose()
        jmp     .L31
.L38:
        movq    xmm1, rbx
        mov     rdi, rsp
        movq    xmm0, r12
        mov     QWORD PTR [rsp+16], rbp
        punpcklqdq      xmm0, xmm1
        movaps  XMMWORD PTR [rsp], xmm0
        call    std::vector<MyClass, std::allocator<MyClass>>::~vector() [complete object destructor]
        mov     rdi, r14
        call    _Unwind_Resume
.L46:
        mov     rdi, r12
        mov     esi, 32000
        call    operator delete(void*, unsigned long)
        mov     rdi, r14
        call    _Unwind_Resume
main:
        sub     rsp, 8
        call    test_prefix()
        call    test_postfix()
        xor     eax, eax
        add     rsp, 8
        ret
MyClass::copy_count:
        .zero   4

AI分析：

让我们详细对比 test_prefix() 和 test_postfix() 在 vector 版本中的汇编代码：

函数入口和内存分配部分完全相同：

push    r14
xor     eax, eax
push    r12
push    rbp
push    rbx
sub     rsp, 40
xchg    eax, DWORD PTR MyClass::copy_count[rip]
mov     edi, 32000
call    operator new(unsigned long)

初始化循环部分完全相同：

.L13/.L32:
        lea     rax, [rbx+16]
        mov     edx, 120
        mov     esi, 100
        mov     QWORD PTR [rbx], rax
        mov     rdi, rbx
        call    std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>::_M_construct(unsigned long, char)
        add     rbx, 32
        cmp     rbx, rbp
        jne     .L13/.L32

清理部分完全相同：

add     rsp, 40
mov     rdi, r12
mov     esi, 32000
pop     rbx
pop     rbp
pop     r12
pop     r14
jmp     operator delete(void*, unsigned long)

错误处理部分也完全相同，只是标签名不同。

唯一的区别仍然只是：

• 标签命名（如 .L13 vs .L32）
• 输出字符串内容（"Prefix increment copy count: " vs "Postfix increment copy count: "）
• 输出字符串长度（29 vs 30）
结论：
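与 list 的情况类似，在 vector 中前缀++和后缀++也被编译器优化成了完全相同的代码。
需要注意：上面的汇编里只有元素构造循环、输出和析构循环，遍历循环本身已经被整体消除（仅仅绑定一个 volatile 引用而不通过它读写，并不会产生内存访问），因此这里实际比较到的是“两种写法的遍历循环都被优化掉了”。
这是因为：

• vector 的迭代器本质上是指针
• 循环中没有使用迭代器的返回值
• 编译器识别出这种模式后，生成了最优的代码
• 由于是连续内存，迭代器递增就是简单的指针加法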

所以，无论是 list 还是 vector，在现代编译器的优化下，前缀++和后缀++在这种简单遍历场景下并没有性能差异。但如果在更复杂的表达式中使用迭代器的返回值，或者是自定义的迭代器类型，可能还是会有区别。

blog

#C++

前缀和后缀++的性能差异

http://example.com/2025/07/07/front_and_end_plus/

作者

icyyoung

发布于

2025年7月7日

许可协议

下一篇：Virtual——潜在的性能杀手