Performance of graph search

from Red Blob Games
28 Jul 2014

Table of Contents

#Measure performance per node not per graph

The problem is that we don’t know the sizes of the graphs; is the performance per node reasonably consistent? Don’t know. I’d expect 100-1000 nodes per millisecond.

#Python

2.1breadth first search#

def breadth_first_search(graph, start, goal):
    frontier = collections.deque()
    frontier.append(start)
    came_from = {}
    came_from[start] = None
    
    while len(frontier) > 0:
        current = frontier.popleft()
        
        if current == goal:
            break
        
        for next in graph.neighbors(current):
            if next not in came_from:
                frontier.append(next)
                came_from[next] = current
    
    return came_from

g = SquareGrid(30, 15)
g.walls = DIAGRAM1_WALLS

import time
def time_it():
    count = 100
    t = time.time()
    for i in range(count):
        breadth_first_search(g, (8, 7), (17, 2))
    print(1000*(time.time() - t)/count, "ms")

time_it()

2.2dijkstra’s algorithm#

def dijkstra_search(graph, start, goal):
    frontier = PriorityQueue()
    frontier.put(start, 0)
    came_from = {}
    cost_so_far = {}
    came_from[start] = None
    cost_so_far[start] = 0
    
    while not frontier.empty():
        current = frontier.get()
        
        if current == goal:
            break
        
        for next in graph.neighbors(current):
            new_cost = cost_so_far[current] + graph.cost(current, next)
            if next not in cost_so_far or new_cost < cost_so_far[next]:
                cost_so_far[next] = new_cost
                priority = new_cost
                frontier.put(next, priority)
                came_from[next] = current
    
    return came_from, cost_so_far

import time
def time_it():
    count = 100
    t = time.time()
    for i in range(count):
        dijkstra_search(diagram4, (1, 4), (7, 8))
    print(1000*(time.time() - t)/count, "ms")

time_it()

#C++

3.1breadth first search#

I extracted this to performance.cpp so I could try it with various compiler optimization flags.

  • Python 2ms
  • clang++ without optimization 1ms
  • clang++ with -O3 0.28ms
#include <time.h>
#include "redblobgames/pathfinding/a-star/implementation.cpp"

template<typename Graph>
unordered_map<typename Graph::Location, typename Graph::Location>
breadth_first_search(Graph graph, typename Graph::Location start) {
  typedef typename Graph::Location Location;
  queue<Location> frontier;
  frontier.push(start);

  unordered_map<Location, Location> came_from;
  came_from[start] = start;

  while (!frontier.empty()) {
    auto current = frontier.front();
    frontier.pop();

    for (auto next : graph.neighbors(current)) {
      if (!came_from.count(next)) {
        frontier.push(next);
        came_from[next] = current;
      }
    }
  }
  return came_from;
}


int main() {
  SquareGrid grid = make_diagram1();
  int count = 1000;
  clock_t t = clock();
  for (int i = 0; i < count; i++) {
    auto parents = breadth_first_search(grid, std::make_tuple(7, 8));
  }
  std::cout << "Avg time per graph search: " << (1000.0*(clock() - t)/count/CLOCKS_PER_SEC) << " ms" << std::endl;
}

3.2breadth first search optimized#

Instead of using a single queue that has to add/remove elements at both ends, use two queues. We add to one queue while removing from the other, then swap them.

#include <time.h>
#include <array>
#include <vector>
#include <iostream>

using namespace std;

struct {int dx,dy;} DIRS[] = {{1, 0}, {0, -1}, {-1, 0}, {0, 1}};

struct SquareGrid {
  typedef uint16_t Location;

  int width, height;
  vector<bool> walls;

  SquareGrid(int width_, int height_)
     : width(width_), height(height_) {
    walls.resize(size());
  }

  int size() const { return 1 + width*height; }
  Location to_id(int x, int y) const { return 1 + x + y*width; }
  int to_x(Location id) const { return (id-1) % width; }
  int to_y(Location id) const { return (id-1) / width; }
  bool in_bounds(Location id) const { return 1 <= id && id <= size(); }
  bool passable(Location id) const { return !walls[id]; }
  
  void neighbors(Location id, vector<Location>& neighbors) const {
    int x = to_x(id), y = to_y(id);
    for (int i = 0; i < 4; i++) {
      Location next = to_id(x + DIRS[i].dx, y + DIRS[i].dy);
      if (in_bounds(next) && passable(next)) {
        neighbors.push_back(next);
      }
    }
  }
};

void add_rect(SquareGrid& grid, int x1, int y1, int x2, int y2) {
  for (int x = x1; x < x2; ++x) {
    for (int y = y1; y < y2; ++y) {
      int id = grid.to_id(x, y);
      grid.walls[id] = true;
    }
  }
}

SquareGrid make_diagram1() {
  SquareGrid grid(30, 15);
  add_rect(grid, 3, 3, 5, 12);
  add_rect(grid, 13, 4, 15, 15);
  add_rect(grid, 21, 0, 23, 7);
  add_rect(grid, 23, 5, 26, 7);
  return grid;
}

template<typename Graph>
vector<typename Graph::Location>
breadth_first_search(Graph graph, typename Graph::Location start) {
  typedef typename Graph::Location Location;
  vector<Location> frontier_curr;
  vector<Location> frontier_next;
  frontier_curr.push_back(start);

  vector<Location> came_from;
  vector<Location> neighbors;
  came_from.resize(graph.size());
  came_from[start] = start;

  while(!frontier_curr.empty()) {
    for (const auto& current : frontier_curr) {
      neighbors.clear();
      graph.neighbors(current, neighbors);
      for (const auto& next : neighbors) {
        if (came_from[next] == 0) {
          frontier_next.push_back(next);
          came_from[next] = current;
        }
      }
    }
    frontier_curr.clear();
    swap(frontier_curr, frontier_next);
  }
  return came_from;
}


int main() {
  SquareGrid grid = make_diagram1();
  int count = 1000;
  clock_t t = clock();
  for (int i = 0; i < count; i++) {
    auto parents = breadth_first_search(grid, grid.to_id(7, 8));
  }
  cout << "Avg time per graph search: " << (1000.0*(clock() - t)/count/CLOCKS_PER_SEC) << " ms\n";
}

When using the double-loop we get 0.315 ms

When using int id instead of tuple<int,int> and a vector<int> instead of unordered_map<tuple,tuple> we get 0.191 ms

It’d also be faster if neighbors() filled an array instead of allocating a new one, and if it were int ids instead of tuples –> 0.013661 ms

It’d likely be even faster if we don’t need the full array but just a path, because we can use an uninitialized came_from and only initialize a visited bitvector

c++ -std=c++14 -I ~/Sites -O3 performance-2.cpp -o /tmp/a.out && time /tmp/a.out
instruments -t "Time Profiler" -D /tmp/trace.trace /tmp/a.out

Email me at , or tweet to @redblobgames, or post a public comment: