最快的键值分组

问题描述 投票:0回答:1

给定一组

<key, value>
对,按值将键分组在一起的最先进方法是什么?

#include <vector>
#include <random>
#include <execution>
#include <unordered_map>
#include <iostream>
#include <tbb/concurrent_unordered_map.h>
#include <tbb/concurrent_vector.h>
#include <chrono>


template<typename timetype = std::chrono::microseconds>
struct tiktok
{
  std::vector<std::chrono::time_point<std::chrono::steady_clock> > starts;
  void reset() { starts.reserve(50); starts.resize(0); }
  tiktok() { reset(); }
  std::size_t tik() {  
    starts.emplace_back(std::chrono::steady_clock::now()); 
    return 0; 
  }
  std::size_t tok() { 
    std::size_t rst = std::chrono::duration_cast<timetype> (
      std::chrono::steady_clock::now() - starts.back()).count();
    starts.pop_back();
    return rst;
  }
};


int main()
{
  int NuniqueString = 3e5;
  std::vector<std::string > x(NuniqueString);
  std::mt19937 rng(123);
  for (auto& u: x) { 
    u = std::string(rng() % 1024 + 1, ' ');
    char* c = &u[0];
    for (int i = 0, iend = u.size(); i < iend; ++i)
      c[i] = rng() % 256;
  } 
  
  // The key-value pair definition.
  struct Item { int key; std::string s; };
  std::vector<Item> items(x.size() * 10);
  for (int i = 0, iend = items.size(); i < iend; ++i) { 
    items[i].key = i;
    items[i].s = x[rng() % NuniqueString];
  } 
  auto itemsReserve = items;
  
  // Measure time cost for grouping items' keys, using STL unordered_map
  tiktok timer;
  if constexpr (true) { 
    std::unordered_map<
      std::string, std::vector<int> > H (items.size() * 1.3);
    timer.tik();
    std::for_each(items.begin(), items.end(), [&](auto& i)->void { 
      H[std::move(i.s)].push_back(i.key);
    }); 
    std::cout << "Sequential, use STL unordered_map time cost (ms) = " << 
      timer.tok() << "\n\n";
  } 
  
  // Measure time cost using tbb concurrent unordered_map and concurrent vector.
  if constexpr (true) { 
    items = itemsReserve;
    tbb::concurrent_unordered_map<
      std::string, tbb::concurrent_vector<int>, 
      std::hash<std::string> > H(items.size() * 1.3);
    timer.tik();
    std::for_each( std::execution::par_unseq, items.begin(), 
                   items.end(), [&](auto& i)->void { 
        auto it = H.insert(std::pair(
          std::move(i.s),
          tbb::concurrent_vector<int>()));
        it.first->second.push_back(i.key);
      }); 
    std::cout << "Parallel, use tbb concurrent unordered_map and"
    " concurrent vector time cost (ms) = " << timer.tok() << "\n\n";
  }
}

g++ -std=c++20 groupStrings.cpp -Ofast -march=native -o test -ltbb
在 16 核机器上产生以下结果:

Sequential, use STL unordered_map time cost (ms) = 1700035

Parallel, use tbb concurrent unordered_map and concurrent vector time cost (ms) = 1575196

我需要经常对大量键值对数组执行此类分组。要开始进行我自己的治疗,有没有任何 SOTA 方法可以解决它?

c++ multithreading algorithm hashmap grouping
1个回答
0
投票

问题是字符串不应该移入

tbb::concurrent_unordered_map
:

#include <vector>
#include <random>
#include <execution>
#include <unordered_map>
#include <iostream>
#include <tbb/concurrent_unordered_map.h>
#include <tbb/concurrent_vector.h>
#include <chrono>


template<typename timetype = std::chrono::microseconds>
struct tiktok
{
  std::vector<std::chrono::time_point<std::chrono::steady_clock> > starts;
  void reset() { starts.reserve(50); starts.resize(0); }
  tiktok() { reset(); }
  std::size_t tik() {  
    starts.emplace_back(std::chrono::steady_clock::now()); 
    return 0; 
  }
  std::size_t tok() { 
    std::size_t rst = std::chrono::duration_cast<timetype> (
      std::chrono::steady_clock::now() - starts.back()).count();
    starts.pop_back();
    return rst;
  }
};


int main()
{
  int NuniqueString = 3e5;
  std::vector<std::string > x(NuniqueString);
  std::mt19937 rng(123);
  for (auto& u: x) { 
    u = std::string(rng() % 1024 + 1, ' ');
    char* c = &u[0];
    for (int i = 0, iend = u.size(); i < iend; ++i)
      c[i] = rng() % 256;
  } 
  
  // The key-value pair definition.
  struct Item { int key; std::string s; };
  std::vector<Item> items(x.size() * 10);
  for (int i = 0, iend = items.size(); i < iend; ++i) { 
    items[i].key = i;
    items[i].s = x[rng() % NuniqueString];
  } 
  auto itemsReserve = items;
  
  // Measure time cost for grouping items' keys, using STL unordered_map
  tiktok timer;
  if constexpr (true) { 
    std::unordered_map<
      std::string, std::vector<int> > H (items.size() * 1.3);
    timer.tik();
    std::for_each(items.begin(), items.end(), [&](auto& i)->void { 
      H[std::move(i.s)].push_back(i.key);
    }); 
    std::cout << "Sequential, use STL unordered_map time cost (ms) = " << 
      timer.tok() << "\n\n";
  } 
  
  // Measure time cost using tbb concurrent unordered_map and concurrent vector.
  if constexpr (true) { 
    items = itemsReserve;
    tbb::concurrent_unordered_map<
      std::string, tbb::concurrent_vector<int>, 
      std::hash<std::string> > H(items.size() * 1.3);
    timer.tik();
    std::for_each( std::execution::par_unseq, items.begin(), 
                   items.end(), [&](auto& i)->void { 
        auto it = H.insert(std::pair(
          i.s, // std::move(i.s),
          tbb::concurrent_vector<int>()));
        it.first->second.push_back(i.key);
      }); 
    std::cout << "Parallel, use tbb concurrent unordered_map and"
    " concurrent vector time cost (ms) = " << timer.tok() << "\n\n";
  }
}

这将打印以下时间成本:

Sequential, use STL unordered_map time cost (ms) = 1691195

Parallel, use tbb concurrent unordered_map and concurrent vector time cost (ms) = 423323

4 倍加速还不错。但现在的问题是,为什么移动资源比复制慢得多?我的直觉是,每次调用

std::move()
都需要线程写入
string
的标头(如果我是对的,则为 24 字节块)以设置指向
nullptr
的指针。由于这些标头在内存中是连续的,因此写入将不断攻击同一缓存行(64 字节块)。缓存一致性机制启动并减慢速度。简而言之,虚假共享https://en.wikipedia.org/wiki/False_sharing。如果不是不小心错过了
std::move
,我会浪费更多时间。棘手棘手..

@David Eisenstat 的建议很棒。

parlay::group_by_key()
非常易于使用,并且比
tbb::concurrent_unordered_map
方法快约 1.5 倍。强烈推荐。

© www.soinside.com 2019 - 2024. All rights reserved.