Algorithm trumps cache locality-big O matters

preamble
Investigating
https://www.geeksforgeeks.org/write-a-c-program-that-given-a-set-a-of-n-numbers-and-another-number-x-determines-whether-or-not-there-exist-two-elements-in-s-whose-sum-is-exactly-x/
I was wondering if cache locality trumps using a hashed set, on a modern CPU.
There are 3 implementations of the search.

  • Naive one – iterate over all possible combinations
  • Sorting – sort vector and random walk vector to find target sum
  • Hash set – using hash set to store previously tested numbers
  •  
    Investigation
    For small sets of data say 100 elements, the random walk pattern looks more successful, as the hashing/inserting in the third solution is taking way too much time.

    --------------------------------------------------
    Benchmark           Time           CPU Iterations
    --------------------------------------------------
    BM_METHOD1      14700 ns      14676 ns      43151
    BM_METHOD2       1540 ns       1539 ns     447113
    BM_METHOD3      23578 ns      23554 ns      29603
    

    For larger sets of data say 1000000, the sorting of the random walk pattern appears to dominate and then the hashing solution is definitely a clear winner.
     
    --------------------------------------------------
    Benchmark           Time           CPU Iterations
    --------------------------------------------------
    BM_METHOD1  108818475 ns  108747000 ns          6 
    BM_METHOD2   66346574 ns   66334000 ns         10
    BM_METHOD3    1583479 ns    1583115 ns        436 
    

    Conclusion
    You need to benchmark if in any doubt. Algorithm always trumps caching on large scale, but on small scale the expense of the algorithm becomes important.

    Full code

    #include <functional>
    #include <iostream>
    #include <vector>
    #include <unordered_set>
    #include <random>
    #include <benchmark/benchmark.h>
    using namespace std;
    
    vector<int>
    GenVec(size_t size_, int valueRange_)
    {
    	vector<int> vec;
    
    	  // create source of randomness, and initialize it with non-deterministic seed
    	  std::random_device r;
    	  std::seed_seq seed{r(), r(), r(), r(), r(), r(), r(), r()};
    	  std::mt19937 eng{seed};
    
    	  // a distribution that takes randomness and produces values in specified range
    	  std::uniform_int_distribution<> dist(-valueRange_, valueRange_);
    
    	for(int i = 0; i < size_; ++i)
    	{
    		vec.emplace_back(dist(eng));
    	}
    
    	return vec;
    }
    
    static vector<int> vec = GenVec(1000000, 1000000);
    static int target = 1600000;
    
    using RetT = pair<int, int>;
    
    //Naive approach - O(n^2)
    RetT method1(const vector<int>& src_, int target_)
    {
    	auto it = src_.begin();
    	auto itEnd = src_.end();
    
    	for(; it != itEnd; ++it)
    	{
    		auto it2 = it;
    		++it2;
    		for(;it2 != itEnd; ++it2)
    		{
    			int sum = *it + *it2;
    			if (sum == target_)
    			{
    				assert(sum == target);
    				return RetT{*it, *it2};
    			}
    		}
    	}
    
    	return RetT{0,0};
    }
    
    //Random walk on sorted vector - sort dominates task - O(n log n)
    RetT method2(const vector<int>& src_, int target_)
    {
    	vector<int> src{src_};
    	sort(src.begin(), src.end());
    
    	auto itS = src.begin();
    	auto itE = src.end();
    	while(itS < itE)
    	{
    		auto sum = *itS + *itE;
    		if (sum == target_)
    		{
    			assert(sum == target);
    			return RetT{*itS, *itE};
    		}
    		else if(sum < target_)
    		{
    			++itS;
    		}
    		else
    		{
    			--itE;
    		}
    	}
    
    	return RetT{0,0};
    }
    
    //Hashing dominates - O(n)
    RetT method3(const vector<int>& src_, int target_)
    {
    	unordered_set<int> s;
    
        for (auto i: src_)
        {
            int temp = target_ - i;
    
            if (temp >= 0 && s.find(temp) != s.end())
            {
            	return RetT{i, temp};
            }
    
            s.insert(i);
        }
    }
    
    
    static void BM_METHOD1(benchmark::State& st_)
    {
    	for (auto _ : st_)
    	{
    	    method1(vec, target);
    	}
    }
    BENCHMARK(BM_METHOD1);
    
    static void BM_METHOD2(benchmark::State& st_)
    {
    	for (auto _ : st_)
    	{
    	    method2(vec, target);
    	}
    }
    BENCHMARK(BM_METHOD2);
    
    static void BM_METHOD3(benchmark::State& st_)
    {
    	for (auto _ : st_)
    	{
    	    method3(vec, target);
    	}
    }
    BENCHMARK(BM_METHOD3);
    
    BENCHMARK_MAIN();
    
    

    Leave a Reply

    Your email address will not be published. Required fields are marked *

    This site uses Akismet to reduce spam. Learn how your comment data is processed.