hacheyz
diff --git a/‎lab2/bfprt_select.py‎
Lines changed: 50 additions & 0 deletions b/‎lab2/bfprt_select.py‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎lab2/fig/normal.png‎
162 KB b/‎lab2/fig/normal.png‎
162 KB
diff --git a/‎lab2/fig/theta.png‎
154 KB b/‎lab2/fig/theta.png‎
154 KB
diff --git a/‎lab2/fig/uniform.png‎
154 KB b/‎lab2/fig/uniform.png‎
154 KB
diff --git a/‎lab2/fig/zipf.png‎
230 KB b/‎lab2/fig/zipf.png‎
230 KB
diff --git a/‎lab2/gen_data.py‎
Lines changed: 11 additions & 0 deletions b/‎lab2/gen_data.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎lab2/lazy_select.py‎
Lines changed: 35 additions & 0 deletions b/‎lab2/lazy_select.py‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎lab2/main.py‎
Lines changed: 101 additions & 10 deletions b/‎lab2/main.py‎
Lines changed: 101 additions & 10 deletions
diff --git a/‎lab2/sort_select.py‎
Lines changed: 15 additions & 0 deletions b/‎lab2/sort_select.py‎
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,50 @@
def median(arr: list) -> int:
    """Return the median of a small group of at most five elements.

    For a group with an even number of elements the upper of the two
    middle values is returned (index ``len(arr) // 2`` of the sorted order).

    The input list is not modified: a sorted copy is used, so callers can
    safely pass a list they still need in its original order.

    Args:
        arr: Group of mutually comparable values, length 1..5.

    Returns:
        The element at position ``len(arr) // 2`` of the sorted group.

    Raises:
        AssertionError: If ``arr`` holds more than five elements.
        IndexError: If ``arr`` is empty.
    """
    assert len(arr) <= 5
    # sorted() builds a new list, leaving the caller's list untouched.
    return sorted(arr)[len(arr) // 2]
+
+
def partition(arr: list, MoM: int) -> tuple:
    """Split arr around the pivot MoM into three lists.

    Returns a tuple ``(less, equal, greater)``; the relative order of the
    elements inside each list follows their order in ``arr``. An element
    that is neither smaller than nor equal to the pivot goes to ``greater``.
    """
    less, equal, greater = [], [], []
    for value in arr:
        # Pick the destination bucket first, then append exactly once.
        if value < MoM:
            bucket = less
        elif value == MoM:
            bucket = equal
        else:
            bucket = greater
        bucket.append(value)
    return less, equal, greater
+
+
def bfprt(arr: list) -> int:
    """Return the median-of-medians pivot of arr (BFPRT pivot rule).

    ``arr`` is cut into consecutive groups of five, the median of every
    complete group is taken, and the exact median of those group medians is
    returned. Up to four trailing elements that do not fill a complete
    group are ignored when choosing the pivot.

    The median of the group medians is obtained with ``bfprt_select``
    instead of re-applying the grouping step to the medians. Repeating the
    grouping step only yields a "median of medians of medians ...", which
    gives a much weaker rank guarantee for the pivot than the one the
    linear worst-case analysis of BFPRT relies on.

    Args:
        arr: Non-empty list of mutually comparable values. Not modified.

    Returns:
        An element of ``arr`` to be used as the partition pivot.

    Raises:
        IndexError: If ``arr`` is empty.
    """
    n = len(arr)
    if n <= 5:
        # Pass a copy: median() may sort its argument in place.
        return median(list(arr))
    # Slices are fresh lists, so median() can never disturb ``arr``.
    # The range stops at the last complete group of five.
    medians = [median(arr[start:start + 5]) for start in range(0, n - n % 5, 5)]
    # len(medians) == n // 5 < n, so this mutual recursion terminates.
    return bfprt_select(medians, len(medians) // 2)
+
+
def bfprt_select(arr: list, k: int) -> int:
    """Return the k-th smallest element of arr (0-based) using BFPRT.

    Steps performed on the current candidate list until it has at most
    five elements:
      1. pick a pivot with ``bfprt`` (median of the group-of-five medians);
      2. split the candidates into less / equal / greater with ``partition``;
      3. keep only the part that contains rank ``k``, adjusting ``k``.
    The final short list is sorted directly.

    ``k`` follows list-index conventions: ``0`` is the minimum, and a
    negative ``k`` counts from the largest element (``-1`` is the maximum).

    Args:
        arr: List of mutually comparable values. Not modified.
        k: Rank of the wanted element, ``-len(arr) <= k < len(arr)``.

    Returns:
        The element that would sit at index ``k`` of ``sorted(arr)``.

    Raises:
        IndexError: If ``arr`` is empty or ``k`` is out of range.
    """
    n = len(arr)
    # Validate once up front; otherwise a bad k only fails (or silently
    # picks a wrong element) deep inside the narrowing loop.
    if not -n <= k < n:
        raise IndexError(f"rank {k} out of range for {n} elements")
    if k < 0:
        k += n

    # Rebinding ``arr`` only changes the local name, never the caller's list.
    while len(arr) > 5:
        MoM = bfprt(arr)
        L, E, G = partition(arr, MoM)
        if k < len(L):
            arr = L
        elif k < len(L) + len(E):
            # Rank k falls inside the block of pivot-equal elements.
            return E[0]
        else:
            k -= len(L) + len(E)
            arr = G
    return sorted(arr)[k]
@@ -0,0 +1,11 @@
+import numpy as np
+
+
def gen_data(data_type: str, n: int, iter_num: int) -> tuple:
    """Generate a random data set together with random ranks to query.

    Args:
        data_type: Distribution of the data: ``"uniform"`` (on [0, 1)),
            ``"normal"`` (mean 0, standard deviation 1) or ``"zipf"``
            (exponent 2).
        n: Number of data points to generate; must be at least 1 because
            the ranks are drawn from ``[0, n)``.
        iter_num: Number of ranks to draw.

    Returns:
        A tuple ``(data, k_list)`` where ``data`` is a list of ``n`` samples
        and ``k_list`` is a list of ``iter_num`` integers in ``[0, n)``.

    Raises:
        ValueError: If ``data_type`` is not one of the supported names
            (previously this silently returned ``None``).
    """
    if data_type not in ("uniform", "normal", "zipf"):
        raise ValueError(f"unknown data_type: {data_type!r}")

    # Ranks are drawn before the data, one call per rank.
    k_list = [np.random.randint(0, n) for _ in range(iter_num)]
    if data_type == "uniform":
        data = np.random.uniform(0, 1, n)
    elif data_type == "normal":
        data = np.random.normal(0, 1, n)
    else:
        data = np.random.zipf(2, n)
    return data.tolist(), k_list
@@ -0,0 +1,35 @@
+import random
+from math import sqrt, floor
+
+
def rank(arr: list, x: int) -> int:
    """Count the elements of arr that are strictly smaller than x."""
    below = 0
    for value in arr:
        if value < x:
            below += 1
    return below
+
+
def min_k(sorted_arr: list, k: int) -> int:
    """Return the element at index k of an already sorted list.

    With ``sorted_arr`` in ascending order this is its k-th smallest
    element (0-based).
    """
    kth = sorted_arr[k]
    return kth
+
+
def lazy_select(arr: list, k: int, theta: float = 3 / 4) -> int:
    """Return the k-th smallest element of arr (0-based) by random sampling.

    Las Vegas style selection: a random sample of about ``n ** theta``
    elements is sorted, two sample elements ``L`` and ``H`` bracketing the
    expected position of the answer are chosen, and only the elements of
    ``arr`` lying in ``[L, H]`` are sorted. If the bracket misses rank ``k``
    or captures too many elements, the attempt is repeated with a slightly
    larger sample; once the sample is as large as ``arr`` the whole input
    is sorted instead. For ``0 <= k < len(arr)`` the result is therefore
    always exact and only the running time is random.

    Args:
        arr: Non-empty list of mutually comparable values. Not modified.
        k: Rank of the wanted element.
        theta: Exponent controlling the sample size ``int(n ** theta)``.

    Returns:
        The element that would sit at index ``k`` of ``sorted(arr)``.
    """
    n = len(arr)
    R_len = int(n ** theta)
    # Sample with replacement, then sort the sample. For theta < 1 the
    # sample has fewer than n elements, so this sort is cheaper than
    # sorting arr itself.
    R = sorted(random.choices(arr, k=R_len))
    # Position in R where the k-th smallest element of arr is expected.
    x = (k / n) * R_len
    spread = sqrt(n)
    # Inspect the sample window [l, h] around x, clamped to valid indices.
    l = max(floor(x - spread), 0)
    h = min(floor(x + spread), R_len - 1)
    L, H = min_k(R, l), min_k(R, h)
    Lp, Hp = rank(arr, L), rank(arr, H)
    # Candidates: every element of arr between the two bracket values.
    P = [num for num in arr if L <= num <= H]

    if Lp <= k <= Hp and len(P) <= 4 * n ** theta + 1:
        # Exactly Lp elements precede P in sorted order, so rank k in arr
        # is rank k - Lp inside P.
        P.sort()
        return min_k(P, k - Lp)
    if R_len < n:
        # Retry with a slightly larger sample.
        return lazy_select(arr, k, min(theta + 0.05, 1))
    # Last resort: sort a copy so the caller's list keeps its order.
    return min_k(sorted(arr), k)
@@ -1,16 +1,107 @@
-# This is a sample Python script.
+"""
+比较 3 种中位数选择算法的性能
+ - 算法 1：排序后选择
+ - 算法 2： 确定型中位数线性时间选择 (BFPRT)
+ - 算法 3： 中位数选择随机算法
 
-# Press Shift+F10 to execute it or replace it with your code.
-# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
+实验内容：
+ - 实现三种算法
+ - 数据集自己寻找或生成
+ - 运行时间比较，准确度比较
+ - 扩展性比较
+ - 以恰当、准确、规范的形式表述实验结果
+"""
 
+import time
+import numpy as np
+import matplotlib.pyplot as plt
 
-def print_hi(name):
-    # Use a breakpoint in the code line below to debug your script.
-    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
+from sort_select import sort_select
+from bfprt_select import bfprt_select
+from lazy_select import lazy_select
+from gen_data import gen_data
 
 
-# Press the green button in the gutter to run the script.
-if __name__ == '__main__':
-    print_hi('PyCharm')
def run_select(arr: list, k_list: list, func) -> list:
    """Apply the selection function to arr once per rank in k_list.

    ``func`` is called as ``func(arr, k)``; the results are returned in the
    same order as the ranks.
    """
    return [func(arr, k) for k in k_list]
 
-# See PyCharm help at https://www.jetbrains.com/help/pycharm/
+
def test_all_on_data(arr: list, k_list: list):
    """Time the three selection algorithms on the same data and ranks.

    The algorithms run in the order sort_select, bfprt_select, lazy_select.
    A warning is printed if their results differ.

    Args:
        arr: Data set handed to every algorithm.
        k_list: Ranks to query, each passed to every algorithm.

    Returns:
        A list of three elapsed times in seconds (total over all ranks),
        in the order listed above.
    """
    results = []
    run_time = []
    for select in (sort_select, bfprt_select, lazy_select):
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump with system clock changes.
        start_time = time.perf_counter()
        results.append(run_select(arr, k_list, select))
        run_time.append(time.perf_counter() - start_time)

    sort_select_result, bfprt_select_result, lazy_select_result = results
    if (sort_select_result != bfprt_select_result) or (sort_select_result != lazy_select_result):
        print("Results are not equal!")

    return run_time
+
+
def test(data_type: str, n_list: list, iter_num: int):
    """Benchmark the three algorithms over several data sizes and plot them.

    For every size in ``n_list`` a data set of the given distribution is
    generated, all three algorithms are timed on it, and the average time
    per query is recorded. The three curves are shown in one figure.
    """
    labels = ("sort_select", "bfprt_select", "lazy_select")
    run_times = [[] for _ in labels]  # one series per algorithm, in label order
    for n in n_list:
        arr, k_list = gen_data(data_type, n, iter_num)
        totals = test_all_on_data(arr, k_list)
        for series, total in zip(run_times, totals):
            series.append(total / iter_num)

    fig = plt.figure(dpi=400)
    ax = fig.add_subplot(111)
    for label, series in zip(labels, run_times):
        ax.plot(n_list, series, label=label)
    ax.set_xlabel("Data Size")
    ax.set_ylabel("Run Time")
    ax.set_title(("Run Time of Three Select Algorithms on " + data_type + " Data").title())
    ax.legend()
    plt.show()
+
+
def test_theta(n: int, iter_num: int):
    """Measure how the theta parameter affects lazy_select and plot it.

    One hundred theta values evenly spaced in [0.5, 1] are tried. For each
    one a fresh uniform data set of size ``n`` is generated and the average
    time per query over ``iter_num`` random ranks is recorded.

    Args:
        n: Size of each generated data set.
        iter_num: Number of queries timed per theta value.
    """
    theta_list = np.linspace(0.5, 1, 100).tolist()
    run_times = []
    for theta in theta_list:
        arr, k_list = gen_data("uniform", n, iter_num)
        # perf_counter() is the clock intended for interval timing; it is
        # monotonic and not affected by system clock adjustments.
        start_time = time.perf_counter()
        for k in k_list:
            lazy_select(arr, k, theta)
        run_times.append((time.perf_counter() - start_time) / iter_num)

    fig = plt.figure(dpi=400)
    ax = fig.add_subplot(111)
    ax.plot(theta_list, run_times)
    ax.set_xlabel("Theta")
    ax.set_ylabel("Run Time")
    ax.set_title("Run Time of Lazy Select Algorithm on Different Theta")
    plt.show()
+
+
def main():
    """Run the scalability benchmarks, then the theta sensitivity test."""
    iter_num = 3  # number of queries timed per data set

    # Performance and scalability of the three algorithms:
    # 20 data sizes evenly spaced from 10000 to 100000.
    n_list = np.linspace(10000, 100000, 20, dtype=int).tolist()
    for data_type in ["uniform", "normal", "zipf"]:
        test(data_type, n_list, iter_num)

    # Influence of the key parameter theta on the randomized algorithm.
    test_theta(10000, iter_num)


# Only run the benchmarks when executed as a script, not on import.
if __name__ == "__main__":
    main()
@@ -0,0 +1,15 @@
def quick_sort(arr: list) -> list:
    """Sort arr in ascending order with a three-way quicksort.

    The middle element is used as the pivot. A list with at most one
    element is returned as-is (the same object); otherwise a new list is
    built and ``arr`` is left unchanged.
    """
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    smaller, same, larger = [], [], []
    for item in arr:
        # Each comparison is tested explicitly; an item that satisfies none
        # of them is not placed in any bucket.
        if item < pivot:
            smaller.append(item)
        elif item == pivot:
            same.append(item)
        elif item > pivot:
            larger.append(item)
    return quick_sort(smaller) + same + quick_sort(larger)
+
+
def sort_select(arr: list, k: int) -> int:
    """Return the element at index k of the quick-sorted arr.

    With a valid 0-based rank ``k`` this is the k-th smallest element.
    """
    ordered = quick_sort(arr)
    return ordered[k]