# 蓄水池抽样浅说 (1)

#### 蓄水池

$$P_i(N) = \frac{k}{N}$$

$$P_{N+1}(N+1) = \frac{k}{N+1}$$

\begin{align} P_i(N+1) & = \frac{k}{N} \cdot \left(1 – P_{N+1} \cdot \frac{1}{k}\right)\\ &= \frac{k}{N} \cdot \left(1 – \frac{1}{N+1}\right) \\ &= \frac{k}{N+1} \end{align}

import random
import sys

def reservoirSample(stream, sample_size):
result = []
for index, line in enumerate(stream):
if index < sample_size:
result.append(line)
else:
r = int(random.random() * (index + 1))
if r < sample_size:
result[r] = line
return result

if __name__ == '__main__':
print(reservoirSample(sys.stdin, 10))

#### 换个思路

$$F(x) = \prod_{i=1}^{n-k}\mathbb{P}(u_i \leq x) = x^{n-k}$$

$$P = \int_0^1 (1-x)^k \mathrm{d} F(x) = (n-k)\int_0^1(1-x)^k x^{n-k-1} \mathrm{d} x$$

$$\mathrm{B}(x,y) = \int_0 ^1 t^x(1-t)^y \mathrm{d} t = \frac{x!\,y!}{(x+y+1)!}$$

$$P = (n-k)\,\mathrm{B}(k, n-k-1) = \frac{k!\, (n-k)!}{n!}=\binom{n}{k} ^{-1}$$

import random
import sys
import heapq

def reservoirSample(stream, sample_size):
result = []
for index, line in enumerate(stream):
key = random.random()
if index < sample_size:
heapq.heappush(result, (key, line))
elif key > heapq.nsmallest(1, result)[0][0]:
heapq.heappushpop(result, (key, line))
return list(zip(*result))[1]

if __name__ == '__main__':
print(reservoirSample(sys.stdin, 10))