7 min de leitura ·

Estudo interessante de C++ (AOT) vs Java (JIT) com resultados favoráveis ao JIT

Projeto GitHub com resultados e mais detalhes => https://github.com/coralblocks/CoralBench?tab=readme-ov-file#hotspotvm--xcomp-and-jit-vs-c-llvm-clang-vs-graalvm

Diante dos detalhes abaixo, alguém tem alguma ideia do motivo pelo qual a versão em C++ do benchmark foi mais lenta que a versão em Java? Será que algo passou despercebido ou existe algum aspecto da implementação em C++ que poderia ser otimizado ainda mais? Talvez opções específicas de otimização do Clang que deveríamos explorar?

O objetivo desta pesquisa é explorar as diferenças de desempenho entre as estratégias de compilação JIT (just-in-time) e AOT (ahead-of-time) e entender suas respectivas vantagens e desvantagens. O intuito não é afirmar que uma linguagem é mais lenta ou pior do que a outra.

Em nossos testes, observamos melhores resultados com o HotSpot JVM 23 usando a compilação JIT. Obtivemos resultados mais lentos com C++ (compilado com Clang 18), GraalVM 23 (compilado com native-image) e HotSpot JVM 23 com a flag -Xcomp. Estamos buscando entender por que isso acontece e, se possível, identificar formas de melhorar o desempenho da versão em C++ para igualar os resultados da compilação JIT em Java.

Nosso benchmark envolve a comparação de uma implementação de tabela hash (map) em Java com uma implementação equivalente em C++. Fizemos todo o esforço possível para garantir consistência entre as duas implementações, mas é possível que alguns detalhes tenham sido negligenciados.

A implementação da tabela hash em si é simples, e procuramos tornar o código em Java e C++ o mais equivalente possível, incluindo a forma como a memória é gerenciada. Por exemplo, garantimos que os valores da tabela hash em C++ sejam armazenados por referência, e não por valor, para evitar cópias desnecessárias.

O benchmark cria uma tabela hash com 5.000.000 de buckets e insere 10.000.000 de objetos, minimizando colisões. A busca linear em cada bucket é limitada a um máximo de duas iterações para evitar discrepâncias nas medições devido a comportamentos variados de colisão entre os diferentes elementos.

A classe Bench, que realiza as medições, também foi projetada para ser equivalente nas duas implementações.

To compile and execute:

$ clang++ -Ofast -march=native -flto -std=c++17 -I./src/main/c -c ./src/main/c/int_map.cpp -o ./target/cpp/int_map.o
$ clang++ -Ofast -march=native -flto -std=c++17 -I./src/main/c -c ./src/main/c/bench.cpp -o ./target/cpp/bench.o
$ clang++ -Ofast -march=native -flto -std=c++17 -I./src/main/c -c ./src/main/c/int_map_benchmark.cpp -o ./target/cpp/int_map_benchmark.o
$ clang++ -Ofast -march=native -flto -std=c++17 -o ./target/cpp/int_map_benchmark ./target/cpp/int_map.o ./target/cpp/bench.o ./target/cpp/int_map_benchmark.o

$ ./target/cpp/int_map_benchmark 0 10000000 5000000

bench.hpp:

#ifndef BENCH_HPP
#define BENCH_HPP

#include <chrono>
#include <iostream>
#include <limits>
#include <iomanip>
#include <string>
#include <cmath>
#include <map>
#include <sstream>

class Bench {
public:
    Bench(int warmupCount = 0);
    ~Bench();

    void mark();
    void measure();
    void reset();
    void printResults() const;

private:
    int warmupCount;
    int measurementCount;
    long long sum;
    long long minTime;
    long long maxTime;
    int size;
    std::map<long long, long long>* results;
    std::chrono::steady_clock::time_point startTime;
    
    static std::string formatWithCommas(long long value);
    static std::pair<double, std::string> formatTime(double nanos);
    static std::string formatPercentage(double perc);
    static double roundToDecimals(double d, int decimals);
    void printPercentiles() const;
    void addPercentile(double perc) const;
};

#endif // BENCH_HPP

bench.cpp:

#include "bench.hpp"
using namespace std;

Bench::Bench(int warmupCount)
    : warmupCount(warmupCount),
      measurementCount(0),
      sum(0),
      minTime(numeric_limits<long long>::max()),
      maxTime(numeric_limits<long long>::min()),
      size(0) {

        results = new std::map<long long, long long>();

}

Bench::~Bench() {
    delete results;
}

void Bench::mark() {
    startTime = chrono::steady_clock::now();
}

void Bench::measure() {
    auto endTime = chrono::steady_clock::now();
    auto elapsed = chrono::duration_cast<chrono::nanoseconds>(endTime - startTime).count();
    measurementCount++;

    if (measurementCount > warmupCount) {
        sum += elapsed;
        if (elapsed < minTime) minTime = elapsed;
        if (elapsed > maxTime) maxTime = elapsed;

        // Increment the frequency of this elapsed time
        auto it = results->find(elapsed);
        if (it == results->end()) {
            results->insert({elapsed, 1});
        } else {
            it->second++;
        }
        size++;
    }
}

void Bench::reset() {
    measurementCount = 0;
    sum = 0;
    minTime = numeric_limits<long long>::max();
    maxTime = numeric_limits<long long>::min();
    results->clear();
    size = 0;
}

void Bench::printResults() const {
    int effectiveCount = measurementCount - warmupCount;
    if (effectiveCount <= 0) {
        cout << "Not enough measurements after warmup to report statistics." << endl;
        return;
    }

    double avg = static_cast<double>(sum) / effectiveCount;

    string effCountStr = formatWithCommas(effectiveCount);
    string warmupStr = formatWithCommas(warmupCount);
    string totalStr = formatWithCommas(measurementCount);

    cout << "Measurements: " << effCountStr
         << " | Warm-Up: " << warmupStr
         << " | Iterations: " << totalStr << endl;

    auto [avgVal, avgUnit] = formatTime(avg);
    auto [minVal, minUnit] = formatTime(static_cast<double>(minTime));
    auto [maxVal, maxUnit] = formatTime(static_cast<double>(maxTime));

    cout << fixed << setprecision(3);
    cout << "Avg Time: " << avgVal << " " << avgUnit << " | "
         << "Min Time: " << minVal << " " << minUnit << " | "
         << "Max Time: " << maxVal << " " << maxUnit << endl;

    printPercentiles();
}

string Bench::formatWithCommas(long long value) {
    std::string numStr = std::to_string(value);
    int insertPosition = static_cast<int>(numStr.length()) - 3;
    while (insertPosition > 0) {
        numStr.insert(insertPosition, ",");
        insertPosition -= 3;
    }
    return numStr;
}

pair<double, string> Bench::formatTime(double nanos) {
    if (nanos >= 1'000'000'000.0) {
        double seconds = nanos / 1'000'000'000.0;
        return {roundToDecimals(seconds, 3), seconds > 1 ? "seconds" : "second"};
    } else if (nanos >= 1'000'000.0) {
        double millis = nanos / 1'000'000.0;
        return {roundToDecimals(millis, 3), millis > 1 ? "millis" : "milli"};
    } else if (nanos >= 1000.0) {
        double micros = nanos / 1000.0;
        return {roundToDecimals(micros, 3), micros > 1 ? "micros" : "micro"};
    } else {
        double ns = nanos;
        return {roundToDecimals(ns, 3), ns > 1 ? "nanos" : "nano"};
    }
}

double Bench::roundToDecimals(double d, int decimals) {
    double pow10 = std::pow(10.0, decimals);
    return std::round(d * pow10) / pow10;
}

void Bench::printPercentiles() const {

    if (size == 0) return;

    double percentiles[] = {0.75, 0.90, 0.99, 0.999, 0.9999, 0.99999};

    for (double p : percentiles) {
        addPercentile(p);
    }
}

std::string Bench::formatPercentage(double perc) {
    double p = perc * 100.0;

    std::ostringstream oss;
    oss << std::fixed << std::setprecision(6) << p;

    std::string s = oss.str();
    // remove trailing zeros
    while (s.back() == '0') {
        s.pop_back();
    }

    // if the last character is now a '.', remove it
    if (s.back() == '.') {
        s.pop_back();
    }

    // Append the '%' sign
    s += "%";

    return s;
}

void Bench::addPercentile(double perc) const {

    if (results->empty()) return;

    long long target = static_cast<long long>(std::llround(perc * size));
    if (target == 0) return;
    if (target > size) target = size;

    // Iterate through the map to find the top element at position target
    long long iTop = 0;
    long long sumTop = 0;
    long long maxTop = -1;

    for (auto &entry : *results) {
        long long time = entry.first;
        long long count = entry.second;

        for (int i = 0; i < count; i++) {
            iTop++;
            sumTop += time;
            if (iTop == target) {
                maxTop = time;
                goto FOUND;
            }
        }
    }

FOUND:;

    double avgTop = static_cast<double>(sumTop) / iTop;
    auto [avgVal, avgUnit] = formatTime(avgTop);
    auto [maxVal, maxUnit] = formatTime(static_cast<double>(maxTop));

    cout << fixed << setprecision(3);
    cout << formatPercentage(perc) << " = [avg: " << avgVal << " " << avgUnit
         << ", max: " << maxVal << " " << maxUnit << "]\n";
}

int_map.hpp:

#ifndef INT_MAP_HPP
#define INT_MAP_HPP

#include <optional>
#include <cstddef>

template <typename E>
class IntMap {

private:

    template <typename T>
    struct Entry {
        int key;
        T* value;
        Entry<T>* next;
    };

    Entry<E>** data;
    int count;
    Entry<E>* head;
    const int capacity;

    Entry<E>* newEntry(int key, const E& value, Entry<E>* next) {
        Entry<E>* newEntry = head;

        if (newEntry != nullptr) {
            head = newEntry->next;
        } else {
            newEntry = new Entry<E>();
        }

        newEntry->key = key;
        newEntry->value = const_cast<E*>(&value);
        newEntry->next = next;

        return newEntry;
    }

    void free(Entry<E>* e) {
        e->value = nullptr;
        e->next = head;
        head = e;
    }

    int toArrayIndex(int key) const {
        return (key & 0x7FFFFFFF) % capacity;
    }

public:

    IntMap(int capacity)
        : capacity(capacity), count(0), head(nullptr) {
        data = new Entry<E>*[capacity];
        for (int i = 0; i < capacity; i++) {
            data[i] = nullptr;
        }
    }

    ~IntMap() {
        clear();

        while (head != nullptr) {
            Entry<E>* temp = head;
            head = head->next;
            delete temp;
        }

        delete[] data;
    }

    int size() const {
        return count;
    }

    E* get(int key) const {
        Entry<E>* e = data[toArrayIndex(key)];
        while (e != nullptr) {
            if (e->key == key) {
                return e->value;
            }
            e = e->next;
        }
        return nullptr;
    }

    E* put(int key, const E& value) {
        int index = toArrayIndex(key);
        Entry<E>* e = data[index];

        while (e != nullptr) {
            if (e->key == key) {
                E* old = e->value;
                e->value = const_cast<E*>(&value);
                return old;
            }
            e = e->next;
        }

        data[index] = newEntry(key, value, data[index]);
        count++;
        return nullptr;
    }

    E* remove(int key) {
        int index = toArrayIndex(key);
        Entry<E>* e = data[index];
        Entry<E>* prev = nullptr;

        while (e != nullptr) {
            if (e->key == key) {
                if (prev != nullptr) {
                    prev->next = e->next;
                } else {
                    data[index] = e->next;
                }

                E* oldValue = e->value;
                free(e);
                count--;
                return oldValue;
            }
            prev = e;
            e = e->next;
        }
        return nullptr;
    }

    void clear() {
        for (int index = capacity - 1; index >= 0; index--) {
            while (data[index] != nullptr) {
                Entry<E>* next = data[index]->next;
                free(data[index]);
                data[index] = next;
            }
        }
        count = 0;
    }
};

#endif // INT_MAP_HPP

int_map.cpp:

#include "int_map.hpp"

// NOOP

int_map_benchmark.cpp:

#include "int_map.hpp"
#include "bench.hpp"
#include <iostream>

using namespace std;

struct Dummy {};

int main(int argc, char* argv[]) {

    int warmupCount = 1000000;
    int measureCount = 1000000;
    int capacity = 100000;

    if (argc > 1) {
        warmupCount = atoi(argv[1]);
    }
    if (argc > 2) {
        measureCount = atoi(argv[2]);
    }
    if (argc > 3) {
        capacity = atoi(argv[3]);
    }

    cout << "\nArguments: warmup=" << warmupCount << " measurements=" << measureCount << " mapCapacity=" << capacity << endl << endl;

    int iterations = warmupCount + measureCount;

    IntMap<Dummy>* map = new IntMap<Dummy>(capacity);
    Bench bench(warmupCount);
    Dummy dummy;

    cout << "Benchmarking put..." << endl;
    bench.reset();
    for (int i = 0; i < iterations; i++) {
        bench.mark();
        map->put(i, dummy);
        bench.measure();
    }
    bench.printResults();
    cout << endl;

    cout << "Benchmarking get..." << endl;
    bench.reset();
    for (int i = 0; i < iterations; i++) {
        bench.mark();
        volatile auto val = map->get(i); // volatile to prevent optimization out
        bench.measure();
    }
    bench.printResults();
    cout << endl;

    cout << "Benchmarking remove..." << endl;
    bench.reset();
    for (int i = 0; i < iterations; i++) {
        bench.mark();
        map->remove(i);
        bench.measure();
    }
    bench.printResults();
    cout << endl;

    delete map; 
    return 0;
}

Results:

Benchmarking put...
Measurements: 10,000,000 | Warm-Up: 0 | Iterations: 10,000,000
Avg Time: 38.597 nanos | Min Time: 30.000 nanos | Max Time: 31.107 micros
75% = [avg: 32.235 nanos, max: 33.000 nanos]
90% = [avg: 32.501 nanos, max: 35.000 nanos]
99% = [avg: 33.093 nanos, max: 106.000 nanos]
99.9% = [avg: 37.288 nanos, max: 884.000 nanos]
99.99% = [avg: 38.177 nanos, max: 1.316 micros]
99.999% = [avg: 38.438 nanos, max: 15.270 micros]

Benchmarking get...
Measurements: 10,000,000 | Warm-Up: 0 | Iterations: 10,000,000
Avg Time: 24.109 nanos | Min Time: 19.000 nanos | Max Time: 15.886 micros
75% = [avg: 21.578 nanos, max: 25.000 nanos]
90% = [avg: 22.200 nanos, max: 26.000 nanos]
99% = [avg: 22.978 nanos, max: 91.000 nanos]
99.9% = [avg: 23.639 nanos, max: 160.000 nanos]
99.99% = [avg: 23.884 nanos, max: 432.000 nanos]
99.999% = [avg: 23.963 nanos, max: 13.451 micros]

Benchmarking remove...
Measurements: 10,000,000 | Warm-Up: 0 | Iterations: 10,000,000
Avg Time: 24.279 nanos | Min Time: 19.000 nanos | Max Time: 29.132 micros
75% = [avg: 21.899 nanos, max: 24.000 nanos]
90% = [avg: 22.426 nanos, max: 26.000 nanos]
99% = [avg: 23.140 nanos, max: 91.000 nanos]
99.9% = [avg: 23.798 nanos, max: 153.000 nanos]
99.99% = [avg: 24.017 nanos, max: 426.000 nanos]
99.999% = [avg: 24.125 nanos, max: 13.548 micros]