Open
Description
With great interest I read the following blog post:
https://lpython.org/blog/2023/07/lpython-novel-fast-retargetable-python-compiler/
But to my surprise I discovered the C++ code in the benchmark of dijkstra_shortest_path
is pessimized rather than optimized.
The usage of maps makes this Code much, much slower as it could.
On my system (some ancient i7 laptop), the following is more than 40 times faster (see below) than the original code.
#include <iostream>
#include <vector>
int32_t dijkstra_shortest_path(int32_t n, int32_t source) {
int32_t i, j, v, u, mindist, alt, dummy, uidx;
std::vector<int32_t> dist, prev;
std::vector<bool> visited;
std::vector<int32_t> Q, graph(n*n);
Q.reserve(n);
dist.reserve(n);
prev.reserve(n);
for(i = 0; i < n; i++) {
for(j = 0; j < n; j++) {
graph[n * i + j] = std::abs(i - j);
}
}
for(v = 0; v < n; v++) {
dist.push_back(2147483647);
prev.push_back(-1);
Q.push_back(v);
visited.push_back(false);
}
dist[source] = 0;
while(Q.size() > 0) {
u = -1;
mindist = 2147483647;
for(i = 0; i < Q.size(); i++) {
if( mindist > dist[Q[i]] ) {
mindist = dist[Q[i]];
u = Q[i];
uidx = i;
}
}
Q.erase(Q.begin() + uidx);
visited[u] = true;
for(v = 0; v < n; v++) {
if( v != u and not visited[v] ) {
alt = dist[u] + graph[n * u + v];
if( alt < dist[v] ) {
dist[v] = alt;
prev[v] = u;
}
}
}
}
int32_t dist_sum = 0;
for(i = 0; i < n; i++) {
dist_sum += dist[i];
}
return dist_sum;
}
int main() {
int32_t n = 4000;
int32_t dist_sum = dijkstra_shortest_path(n, 0);
std::cout<<dist_sum<<std::endl;
return 0;
}
For convenience, this is the diff:
--- a.cpp 2023-07-30 10:59:17.482487750 +0200
+++ f.cpp 2023-07-30 14:57:58.543699027 +0200
@@ -1,13 +1,15 @@
#include <iostream>
-#include <unordered_map>
#include <vector>
int32_t dijkstra_shortest_path(int32_t n, int32_t source) {
int32_t i, j, v, u, mindist, alt, dummy, uidx;
- std::unordered_map<int32_t, int32_t> graph, dist, prev;
- std::unordered_map<int32_t, bool> visited;
- std::vector<int32_t> Q;
-
+ std::vector<int32_t> dist, prev;
+ std::vector<bool> visited;
+ std::vector<int32_t> Q, graph(n*n);
+ Q.reserve(n);
+ dist.reserve(n);
+ prev.reserve(n);
+
for(i = 0; i < n; i++) {
for(j = 0; j < n; j++) {
graph[n * i + j] = std::abs(i - j);
@@ -15,10 +17,10 @@
}
for(v = 0; v < n; v++) {
- dist[v] = 2147483647;
- prev[v] = -1;
+ dist.push_back(2147483647);
+ prev.push_back(-1);
Q.push_back(v);
- visited[v] = false;
+ visited.push_back(false);
}
dist[source] = 0;
Furthermore, I implemented a rust version for comparison.
fn dijkstra_shortest_path(n: i32, source: i32) -> i32 {
let mut dist = Vec::with_capacity(n as usize);
let mut prev = Vec::with_capacity(n as usize);
let mut visited = Vec::with_capacity(n as usize);
let mut q = Vec::with_capacity((n * n) as usize);
let mut graph = vec![0; (n * n) as usize];
for i in 0..n {
for j in 0..n {
graph[(n * i + j) as usize] = (i - j).abs();
}
}
for v in 0..n {
dist.push(2147483647);
prev.push(-1);
q.push(v);
visited.push(false);
}
dist[source as usize] = 0;
let mut u;
let mut uidx = 0;
let mut mindist;
let mut alt;
while q.len() > 0 {
u = -1;
mindist = 2147483647;
for i in 0..q.len() {
let d = dist[q[i] as usize];
if mindist > d {
mindist = d;
u = q[i];
uidx = i;
}
}
q.remove(uidx);
visited[u as usize] = true;
for v in 0..n {
if v != u && !visited[v as usize] {
alt = dist[u as usize] + graph[(n * u + v) as usize];
if alt < dist[v as usize] {
dist[v as usize] = alt;
prev[v as usize] = u;
}
}
}
}
dist.iter().sum()
}
fn main() {
let n = 4000;
let dist_sum = dijkstra_shortest_path(n, 0);
println!("{}\n", dist_sum);
}
Benchmark
$ clang++ -std=c++20 -ffast-math -O3 -march=native old.cpp -o old
$ clang++ -std=c++20 -ffast-math -O3 -march=native better.cpp -o better
$ rustc main.rs -C opt-level=3 -o better-rust
$ hyperfine --warmup=20 --min-runs=10 ./old ./better ./better-rust
Benchmark 1: ./old
Time (mean ± σ): 1.690 s ± 0.112 s [User: 1.380 s, System: 0.299 s]
Range (min … max): 1.407 s … 1.790 s 10 runs
Benchmark 2: ./better
Time (mean ± σ): 40.3 ms ± 2.7 ms [User: 31.5 ms, System: 8.2 ms]
Range (min … max): 34.4 ms … 45.8 ms 71 runs
Benchmark 3: ./better-rust
Time (mean ± σ): 40.4 ms ± 1.7 ms [User: 32.8 ms, System: 7.5 ms]
Range (min … max): 37.7 ms … 44.2 ms 68 runs
Summary
./better ran
1.00 ± 0.08 times faster than ./better-rust
41.93 ± 3.97 times faster than ./old
(compilers used: rustc 1.71.0-nightly (2f2c438dc 2023-05-08)
and clang version 15.0.7
on ArchLinux)
Metadata
Metadata
Assignees
Labels
No labels