在互联网广告投放、负载均衡、A/B测试等场景中,我们经常需要按照预设权重从多个选项中进行选择。本文将深入探讨三种不同的加权随机选择算法,并通过大规模模拟实验对比它们的性能表现。
问题背景
假设我们有4个广告,权重分别为2、3、4、5,需要每次展示2个不同的广告。看似简单的需求,实际上涉及到几个关键问题:
- 如何确保长期展示比例符合权重设定?
- 如何处理”不重复选择”带来的概率分布变化?
- 在需要结果可重现的场景下,如何实现确定性选择?
理论分布的计算
简单权重 vs 不重复选择
首先需要理解一个重要概念:当我们进行不重复选择时,实际的概率分布会与简单权重比例有所不同。
简单权重比例(假设每次选择独立):
- Ad0: 2/14 = 14.29%
- Ad1: 3/14 = 21.43%
- Ad2: 4/14 = 28.57%
- Ad3: 5/14 = 35.71%
考虑不重复选择的实际分布:
- Ad0: 15.92%
- Ad1: 22.74%
- Ad2: 28.50%
- Ad3: 32.85%
为什么会有差异?以Ad0为例:
- 第一位被选中:2/14 ≈ 14.29%
- 第二位被选中:需要考虑其他广告先被选中的情况
- 如果Ad3先被选中(概率5/14),Ad0被选的概率变为2/9 ≈ 22.2%
这种”补偿效应”使得权重较小的广告实际展示比例略有提升。
三种算法实现
1. 真随机算法
使用硬件随机数生成器(std::random_device
),每次运行产生不同结果:
template<typename Generator>
std::vector<int> selectAdsWithGenerator(int count, Generator& gen) {
std::vector<int> selected;
std::vector<bool> used(ads.size(), false);
for (int i = 0; i < count && i < ads.size(); ++i) {
// 计算剩余权重
int remaining_weight = 0;
for (int j = 0; j < ads.size(); ++j) {
if (!used[j]) remaining_weight += ads[j].weight;
}
// 轮盘赌选择
std::uniform_int_distribution<> dis(1, remaining_weight);
int random_value = dis(gen);
int cumulative = 0;
for (int j = 0; j < ads.size(); ++j) {
if (!used[j]) {
cumulative += ads[j].weight;
if (random_value <= cumulative) {
selected.push_back(j);
used[j] = true;
break;
}
}
}
}
return selected;
}
2. 伪随机算法
使用固定种子的随机数生成器,保证结果可重现:
std::mt19937 gen_pseudo(12345); // 固定种子
// 基于随机数的广告选择器(包含真随机和伪随机)
class AdSelector {
private:
struct Ad {
int id;
int weight;
};
std::vector<Ad> ads;
int total_weight;
// 真随机数生成器
std::random_device rd;
std::mt19937 gen_random;
// 伪随机数生成器(固定种子)
std::mt19937 gen_pseudo;
public:
AdSelector(const std::vector<int>& weights)
: gen_random(rd()), gen_pseudo(12345), total_weight(0) {
for (int i = 0; i < weights.size(); ++i) {
ads.push_back({i, weights[i]});
total_weight += weights[i];
}
}
// 真随机版本
std::vector<int> selectAdsRandom(int count) {
return selectAdsWithGenerator(count, gen_random);
}
// 伪随机版本
std::vector<int> selectAdsPseudo(int count) {
return selectAdsWithGenerator(count, gen_pseudo);
}
// 计算理论展示分布(考虑不重复选择)
std::vector<double> calculateTheoreticalDistribution(int select_count) {
std::vector<double> distribution(ads.size(), 0.0);
if (select_count == 2) {
for (int i = 0; i < ads.size(); ++i) {
// 第一位被选中的概率
double prob_first = (double)ads[i].weight / total_weight;
// 第二位被选中的概率
double prob_second = 0.0;
for (int j = 0; j < ads.size(); ++j) {
if (i != j) {
double remaining_weight = total_weight - ads[j].weight;
prob_second += ((double)ads[j].weight / total_weight) *
((double)ads[i].weight / remaining_weight);
}
}
distribution[i] = (prob_first + prob_second) / select_count;
}
}
return distribution;
}
private:
template<typename Generator>
std::vector<int> selectAdsWithGenerator(int count, Generator& gen) {
std::vector<int> selected;
std::vector<bool> used(ads.size(), false);
for (int i = 0; i < count && i < ads.size(); ++i) {
int remaining_weight = 0;
for (int j = 0; j < ads.size(); ++j) {
if (!used[j]) {
remaining_weight += ads[j].weight;
}
}
std::uniform_int_distribution<> dis(1, remaining_weight);
int random_value = dis(gen);
int cumulative = 0;
for (int j = 0; j < ads.size(); ++j) {
if (!used[j]) {
cumulative += ads[j].weight;
if (random_value <= cumulative) {
selected.push_back(j);
used[j] = true;
break;
}
}
}
}
return selected;
}
};
3. 补偿算法
完全确定性的算法,基于补偿机制:
// 改进的权重补偿算法(考虑不重复选择)
class WeightCompensationSelector {
private:
struct Ad {
int id;
int weight;
double credit;
int show_count;
Ad(int i, int w) : id(i), weight(w), credit(0.0), show_count(0) {}
};
std::vector<Ad> ads;
std::vector<double> target_distribution; // 目标分布(考虑不重复选择)
int total_selections;
public:
WeightCompensationSelector(const std::vector<int>& weights)
: total_selections(0) {
// 初始化广告
int total_weight = 0;
for (int i = 0; i < weights.size(); ++i) {
ads.emplace_back(i, weights[i]);
total_weight += weights[i];
}
// 计算目标分布(考虑不重复选择)
target_distribution.resize(ads.size());
for (int i = 0; i < ads.size(); ++i) {
// 第一位被选中的概率
double prob_first = (double)ads[i].weight / total_weight;
// 第二位被选中的概率
double prob_second = 0.0;
for (int j = 0; j < ads.size(); ++j) {
if (i != j) {
double remaining_weight = total_weight - ads[j].weight;
prob_second += ((double)ads[j].weight / total_weight) *
((double)ads[i].weight / remaining_weight);
}
}
// 占总展示的比例
target_distribution[i] = (prob_first + prob_second) / 2.0;
}
}
std::vector<int> selectAds(int count) {
std::vector<int> selected;
std::vector<bool> used(ads.size(), false);
// 计算每个广告的期望展示次数和实际展示次数的差距
for (auto& ad : ads) {
double expected_shows = target_distribution[ad.id] * total_selections * 2;
double actual_shows = ad.show_count;
ad.credit = expected_shows - actual_shows + target_distribution[ad.id] * count;
}
// 选择信用值最高的广告
for (int i = 0; i < count && i < ads.size(); ++i) {
int best_ad = -1;
double best_credit = -1.0;
for (int j = 0; j < ads.size(); ++j) {
if (!used[j] && ads[j].credit > best_credit) {
best_credit = ads[j].credit;
best_ad = j;
}
}
if (best_ad != -1) {
selected.push_back(best_ad);
used[best_ad] = true;
ads[best_ad].show_count++;
}
}
total_selections++;
return selected;
}
void reset() {
for (auto& ad : ads) {
ad.credit = 0.0;
ad.show_count = 0;
}
total_selections = 0;
}
};
实验结果分析
通过10万次模拟(总计20万次广告展示),我们得到了以下结果:
// 主比较函数
void compareThreeMethods(int simulation_count = 100000) {
std::vector<int> weights = {2, 3, 4, 5};
AdSelector selector(weights);
WeightCompensationSelector compensator(weights);
// 统计结果
std::map<int, int> random_stats, pseudo_stats, compensation_stats;
// 初始化
for (int i = 0; i < weights.size(); ++i) {
random_stats[i] = 0;
pseudo_stats[i] = 0;
compensation_stats[i] = 0;
}
// 大规模模拟
compensator.reset();
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < simulation_count; ++i) {
// 真随机
auto random_selected = selector.selectAdsRandom(2);
for (int ad : random_selected) {
random_stats[ad]++;
}
// 伪随机
auto pseudo_selected = selector.selectAdsPseudo(2);
for (int ad : pseudo_selected) {
pseudo_stats[ad]++;
}
// 补偿算法
auto comp_selected = compensator.selectAds(2);
for (int ad : comp_selected) {
compensation_stats[ad]++;
}
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
// 计算理论分布
auto theoretical_dist = selector.calculateTheoreticalDistribution(2);
// 输出结果
std::cout << "\n\n====== 大规模模拟结果 ======\n";
std::cout << "模拟次数: " << simulation_count << "\n";
std::cout << "总展示次数: " << simulation_count * 2 << "\n";
std::cout << "总耗时: " << duration << " ms\n\n";
int total_shows = simulation_count * 2;
// 显示理论分布
std::cout << "理论分布(考虑不重复选择):\n";
for (int i = 0; i < weights.size(); ++i) {
std::cout << " Ad" << i << ": " << std::fixed << std::setprecision(4)
<< theoretical_dist[i] * 100 << "%\n";
}
// 输出三种方法的结果对比
std::cout << "\n====== 实际结果对比 ======\n";
std::cout << std::setw(10) << "广告"
<< std::setw(15) << "理论值"
<< std::setw(15) << "真随机"
<< std::setw(15) << "伪随机"
<< std::setw(15) << "补偿算法\n";
std::cout << std::string(70, '-') << "\n";
for (int i = 0; i < weights.size(); ++i) {
double theory_pct = theoretical_dist[i] * 100;
double random_pct = (double)random_stats[i] / total_shows * 100;
double pseudo_pct = (double)pseudo_stats[i] / total_shows * 100;
double comp_pct = (double)compensation_stats[i] / total_shows * 100;
std::cout << std::setw(10) << ("Ad" + std::to_string(i))
<< std::setw(14) << std::fixed << std::setprecision(2) << theory_pct << "%"
<< std::setw(14) << random_pct << "%"
<< std::setw(14) << pseudo_pct << "%"
<< std::setw(14) << comp_pct << "%\n";
}
// 输出偏差分析
std::cout << "\n====== 偏差分析(与理论值的绝对偏差)======\n";
std::cout << std::setw(10) << "广告"
<< std::setw(15) << "真随机"
<< std::setw(15) << "伪随机"
<< std::setw(15) << "补偿算法\n";
std::cout << std::string(55, '-') << "\n";
double total_dev_random = 0, total_dev_pseudo = 0, total_dev_comp = 0;
for (int i = 0; i < weights.size(); ++i) {
double theory_pct = theoretical_dist[i] * 100;
double random_pct = (double)random_stats[i] / total_shows * 100;
double pseudo_pct = (double)pseudo_stats[i] / total_shows * 100;
double comp_pct = (double)compensation_stats[i] / total_shows * 100;
double dev_random = std::abs(random_pct - theory_pct);
double dev_pseudo = std::abs(pseudo_pct - theory_pct);
double dev_comp = std::abs(comp_pct - theory_pct);
total_dev_random += dev_random;
total_dev_pseudo += dev_pseudo;
total_dev_comp += dev_comp;
std::cout << std::setw(10) << ("Ad" + std::to_string(i))
<< std::setw(14) << std::fixed << std::setprecision(3) << dev_random << "%"
<< std::setw(14) << dev_pseudo << "%"
<< std::setw(14) << dev_comp << "%\n";
}
std::cout << std::string(55, '-') << "\n";
std::cout << std::setw(10) << "总偏差"
<< std::setw(14) << std::fixed << std::setprecision(3) << total_dev_random << "%"
<< std::setw(14) << total_dev_pseudo << "%"
<< std::setw(14) << total_dev_comp << "%\n";
}
准确性对比
广告 | 理论值 | 真随机 | 伪随机 | 补偿算法 |
---|---|---|---|---|
Ad0 | 15.92% | 15.87% | 15.82% | 15.92% |
Ad1 | 22.74% | 22.77% | 22.81% | 22.74% |
Ad2 | 28.50% | 28.63% | 28.57% | 28.50% |
Ad3 | 32.85% | 32.73% | 32.80% | 32.85% |
偏差分析
算法 | 总偏差 | 最大单项偏差 |
---|---|---|
真随机 | 0.335% | 0.133% |
伪随机 | 0.289% | 0.094% |
补偿算法 | 0.001% | 0.000% |
算法选择建议
真随机算法
- 优点:真正的随机性,适合需要公平性的场景
- 缺点:结果不可重现,调试困难
- 适用场景:线上广告投放、抽奖活动
伪随机算法
- 优点:结果可重现,便于调试和测试
- 缺点:需要管理种子,可能被预测
- 适用场景:A/B测试、需要复现的实验
补偿算法
- 优点:
- 完全确定性,结果稳定
- 长期偏差极小(接近0)
- 不依赖随机数生成器
- 缺点:
- 初期可能有较大偏差
- 需要维护历史状态
- 适用场景:需要严格控制分布的场景、分布式系统