半年多沒碰程式..
編譯器塵封許久,都快忘記介面的樣子囉..
這半年都在練習腦力外的活動,
發現「學習」這件事,都有個特質,
凡非天生就會的東西,總是要練習到一個質量後,
才能穩定固著下來,退步的速度也才會變慢。
而無論是聰明或駑鈍,
想通達「完全瞭解」的路,都會是一樣「長」....
2010年1月3日 星期日
Modify k-Means
對 k-Means 的簡單改良。code 寫的不算好。
底下測試框架生成 2D GMM 群落,對各算法進行比對:
比對圖例:
3 - GM:
6 - GM:
#define ef else if template <typename T> inline void set_vector (T* out, const T* in, int n) { while (n-- > 0) out[n] = in[n]; } template <typename T> inline void add_vector (T* out, const T* in, int n) { while (n-- > 0) out[n] += in[n]; } template <typename T> inline void div_vector (T* v, int divisor, int n) { while (n-- > 0) v[n] /= divisor; } template <typename T> inline double cal_distance (T* a, T* b, int n) { double sum = 0, diff; while (n-- > 0) { diff = a[n] - b[n]; sum += diff * diff; } return sqrt (sum); } template <typename T = float> struct Kmeans { int nMaxIteration; //最大迭代次數 int cur_iter; //目前的迭代次數 int nData; //資料項數 int* label; //資料項的分類標籤 int* ccount; //群心涵蓋的資料個數 T** data; //指向資料集合 T** center; //群心集合 T* radius; //群心的超球面半徑 T* new_center; //計算暫用的群心 T* mean; //資料項的幾何中心 int nCenter; //群心個數 int dim; //資料維度 double cluster_movement; //群心更新時的最大移動量 Kmeans() { srand ((unsigned int)time(0)); label = 0; ccount = 0; radius = 0; center = 0; mean = 0; new_center = 0; } ~Kmeans() { release(); } protected: void set (T* out, const T* in) {set_vector (out, in, dim);} void add (T* out, const T* in) {add_vector (out, in, dim);} void div (T* v, int divisor) {div_vector (v, divisor, dim);} double distance (T* a, T* b) {return cal_distance (a, b, dim);} void release() //釋放資源 { if (label) del_arr (label); label = 0; if (ccount) del_arr (ccount); ccount = 0; if (radius) del_arr (radius); radius = 0; if (center) del_arr (center); center = 0; if (mean) del_arr (mean); mean = 0; if (new_center) del_arr (new_center); new_center = 0; } void init_centers () { if (nCenter < 1) ERROR_MSG_("群心數目需 > 1"); if (nCenter > nData) nCenter = nData; release(); //釋放資源 new_center = (T*) new_arr (sizeof(T), 1, dim); radius = (T*) new_arr (sizeof(T), 1, nCenter); mean = (T*) new_arr (sizeof(T), 1, dim); center = (T**) new_arr (sizeof(T), 2, nCenter, dim); ccount = (int*) new_arr (sizeof(int), 1, nCenter); label = (int*) new_arr (sizeof(int), 1, nData); int i; for (i=0; i<nCenter; ++i) //隨機指定樣本點為群心 set (center[i], data [rand() % nData]); for (i=0; i<nData; ++i) //計算資料項的幾何中心 add (mean, data[i]); div (mean, nData); } void cluster_assignment () //將樣本分群 { int label_id, j, k; double min_dist, d; memset (ccount, 0, nCenter* sizeof(int)); memset (radius, 0, nCenter* sizeof(T)); for (j=0; j<nData; ++j) { label_id = 0; min_dist = distance (center[0], data[j]); for (k=1; k<nCenter; ++k) { d = distance (center[k], data[j]); if (d < min_dist) { label_id = k; min_dist = d; } } if (radius[label_id] < min_dist) radius[label_id] = min_dist; label[j] = label_id; //歸類 ccount[label_id]++; //統計群心涵蓋的資料個數 } } void update_cluster () //重新估算 centroids 位置 { int j, k, total; double dist = 0; cluster_movement = 0; for (k=0; k<nCenter; ++k) { memset (new_center, 0, dim* sizeof(T)); total = 0; for (j=0; j<nData; ++j) if (label[j] == k) { add (new_center, data[j]); total++; } if (total > 0) { div (new_center, total); dist = distance (center[k], new_center); if (cluster_movement < dist) cluster_movement = dist; set (center[k], new_center); } else set (center[k], mean); } } public: //算法主流程: // nCenter = 群心數目, // data = 資料集合 = data [nData_] [dim_] void do_cluster (int nCenter_, T** data_, int nData_, int dim_, int nIteration_, double movement_threshold) { setup (nCenter_, data_, nData_, dim_); nMaxIteration = nIteration_; cur_iter = 0; do { cluster_assignment (); //將樣本分群 update_cluster (); //重新估算 centroids 位置 } while (++cur_iter < nMaxIteration && cluster_movement > movement_threshold); } //單步執行 clustering, 回傳群心最大移動量 void setup (int nCenter_, T** data_, int nData_, int dim_) { nCenter = nCenter_; //設定群心數目 data = data_; //指向資料集合 nData = nData_; //設定資料個數 dim = dim_; //設定資料維度 cur_iter = 0; init_centers (); //初始化群心位置 } double iterater_cluster () { cur_iter++; cluster_assignment (); //將樣本分群 update_cluster (); //重新估算 centroids 位置 return cluster_movement; //回傳群心更新時的最大移動量 } }; //-------------------------------------------------------------- // Modify K-Means //-------------------------------------------------------------- template <typename T = float> struct ModifyKmeans: Kmeans<T> { enum {MAX_SELECT = 32}; //每回合最大的群心挑選個數 T** best_center; //計算暫用的群心集合 T** old_center; //計算暫用的群心集合 T* avg_radius; //群心的平均半徑 bool* bAlive; //群心存活狀態(用於合併時) int nMaxCenter; //最大群心數 int nMaxSelect; //最大的群心挑選個數 int nMinCount; //每個群集至少應包含的資料數 double movement_threshold; //指定的群心移動量門檻 double avg_dist; //每一點到群心的平均距離 CvRNG rng; //亂數種子 ModifyKmeans() { best_center = 0; old_center = 0; avg_radius = 0; bAlive = 0; rng = cvRNG (cvGetTickCount()); } ~ModifyKmeans() {release();} void release() //釋放資源 { if (best_center) del_arr (best_center); if (old_center) del_arr (old_center); if (avg_radius) del_arr (avg_radius); if (bAlive) del_arr (bAlive); old_center = 0; best_center = 0; avg_radius = 0; bAlive = 0; Kmeans<T>::release(); } void setup (int nMaxCenter_, T** data_, int nData_, int dim_) { nMaxCenter = nMaxCenter_; //設定群心數目 data = data_; //指向資料集合 nData = nData_; //設定資料個數 dim = dim_; //設定資料維度 cur_iter = 0; nCenter = 1; //目前輪迴的群心數 nMaxSelect = MAX_SELECT; if (nMaxCenter < 1) ERROR_MSG_("最大群心數目需 > 1"); if (nMaxCenter > nData) nMaxCenter = nData; if (nMaxSelect > nData) ERROR_MSG3_("資料項數(%d)需 > 最大的群心挑選個數(%d)", nData, nMaxSelect); release(); //釋放資源 new_center = (T*) new_arr (sizeof(T), 1, dim); radius = (T*) new_arr (sizeof(T), 1, nMaxCenter); bAlive = (bool*)new_arr (sizeof(bool),1, nMaxCenter); avg_radius = (T*) new_arr (sizeof(T), 1, nMaxCenter); best_center = (T**) new_arr (sizeof(T), 2, nMaxCenter, dim); old_center = (T**) new_arr (sizeof(T), 2, nMaxCenter, dim); mean = (T*) new_arr (sizeof(T), 1, dim); center = (T**) new_arr (sizeof(T), 2, nMaxCenter, dim); ccount = (int*) new_arr (sizeof(int), 1, nMaxCenter); label = (int*) new_arr (sizeof(int), 1, nData); for (int i=0; i<nData; ++i) { //計算資料項的幾何中心 add (mean, data[i]); label[i] = 0; } div (mean, nData); set (center[0], mean); //第1個群心是幾何中心 shuffle (data, nData); //洗牌, 將資料內序打亂 } void cluster_assignment () //將樣本分群 { int label_id, i, j, k; double min_dist, d; memset (ccount, 0, nCenter* sizeof(int)); memset (radius, 0, nCenter* sizeof(T)); for (j=0; j<nData; ++j) { label_id = 0; min_dist = distance (center[0], data[j]); for (k=1; k<nCenter; ++k) { d = distance (center[k], data[j]); if (d < min_dist) { label_id = k; min_dist = d; } } if (radius[label_id] < min_dist) //計算每一群的最大半徑 radius[label_id] = min_dist; label[j] = label_id; //歸類 ccount[label_id]++; //統計群心涵蓋的資料個數 } } void update_cluster () //重新估算 centroids 位置 { int j, k, total; double dist = 0; cluster_movement = 0; for (k=0; k<nCenter; ++k) { memset (new_center, 0, dim* sizeof(T)); total = 0; for (j=0; j<nData; ++j) if (label[j] == k) { add (new_center, data[j]); total++; } if (total > 0) { div (new_center, total); dist = distance (center[k], new_center); if (cluster_movement < dist) cluster_movement = dist; //計算最小的群心移動量值 set (center[k], new_center); } else set (center[k], mean); } } void cal_avg_dist() //計算每個群心的平均半徑 { //以及總平均半徑 int j, k, total; int nResetCenter = 0; double dist_sum = 0; double cur_dist = 0; //用來加總每個群心的平均半徑 for (k=0; k<nCenter; ++k) { total = 0; dist_sum = 0; //計算點集到群心的距離總和 for (j=0; j<nData; ++j) if (label[j] == k) { dist_sum += distance (center[k], data[j]); total++; } if (total > 0) { dist_sum /= total; avg_radius[k] = dist_sum; //記錄群心的平均半徑 cur_dist += dist_sum; //加總每個群心的平均半徑 } else nResetCenter++; } //取得每一點到群心的平均距離 avg_dist = cur_dist / (nCenter - nResetCenter); } void merge_center (int parent, int child) { memset (new_center, 0, dim* sizeof(T)); int total = 0; for (int j=0; j<nData; ++j) if (label[j] == parent || label[j] == child) { add (new_center, data[j]); label[j] = parent; total++; } if (total > 0) { div (new_center, total); set (center[parent], new_center); } } void joint_clusters () //合併群內之群與強連通群 { int i, k, j; double d; for (k=0; k<nCenter; ++k) bAlive[k] = true; for (k=0; k<nCenter; ++k) //合併群內之群 if (bAlive[k]) for (j=0; j<nCenter; ++j) if (bAlive[j] && k != j) { d = distance (center[j], center[k]); if (d + radius[j] < radius[k]) { merge_center (k, j); bAlive[j] = false; } } for (j=k=0; k<nCenter; ++k) //重設群的陣列 if (bAlive[k]) { if (k != j) { set (center[j], center[k]); for (i=0; i<nData; ++i) if (label[i] == k) label[i] = j; } j++; } nCenter = j; } void cluster_reassignment () { double min_dist, d; int label_id, k, j; for (j=0; j<nData; ++j) { label_id = 0; min_dist = distance (center[0], data[j]); min_dist -= avg_radius[0]; for (k=1; k<nCenter; ++k) { d = distance (center[k], data[j]); d -= avg_radius[k]; if (d < min_dist) { label_id = k; min_dist = d; } } if (radius[label_id] < min_dist) //計算每一群的最大半徑 radius[label_id] = min_dist; label[j] = label_id; //歸類 ccount[label_id]++; //統計群心涵蓋的資料個數 } } bool kmeans (int k_centers) { nCenter = k_centers; //設定群心個數 int iter = 0; do { cluster_assignment (); //將樣本分群 //並算出每群的最大半徑 if (ccount[nCenter-1] < nMinCount) //若新進群心的包含點數過少 return false; //便捨棄之, 再選別的來試 update_cluster (); //重新估算 centroids 位置 cal_avg_dist (); //計算每個群心的平均半徑 } while (++iter < nMaxIteration && cluster_movement > movement_threshold); return true; } public: void do_cluster (int nMaxCenter_, //最大群心數 T** data_, int nData_, int dim_, //資料 int nMaxIteration_, //最大迭代次數 double mov_threshold_, //誤差門檻: 判別是否收斂 double min_cluster_ratio) //規定每個群集至少應包含 //nData * ratio 筆資料 { setup (nMaxCenter_, data_, nData_, dim_); set (best_center[0], center[0]); nMinCount = int (nData * min_cluster_ratio); nMaxIteration = nMaxIteration_; //內部 K-means 的最大迭代次數 movement_threshold = mov_threshold_; cur_iter = 0; avg_dist = 0; double old_dist = (std::numeric_limits<double>::max)(); int i, j, k; int nCurCenter = 2; //目前的群心個數 //=2 是因為開始時, 以幾何 //中心為主, 就已分了一群 do { double cost = 0; for (j=0; j<nCurCenter-1; ++j) //將目前的最佳群心 set (old_center[j], center[j]); //保存到 old_center 中 for (i=0; i<nMaxSelect; ++i) //嘗試 nMaxSelect 次 { cur_iter++; //計算 kMean 總共做了幾次 do { //持續嘗試做 k-Means j = nCurCenter - 1; //直到生成一個有效的群心 k = cvRandInt (&rng) % nData; set (center[j], data[k]); } while (!kmeans (nCurCenter)); //分成 nCurCenter 群 if (i == 0) cost = avg_dist; else //是否更新最佳群心集合? if (avg_dist < cost) { cost = avg_dist; for (j=0; j<nCurCenter; ++j) set (best_center[j], center[j]); } } for (j=0; j<nCurCenter; ++j) //將最佳群心拷貝到 center set (center[j], best_center[j]);//供下一輪 k-Mean 使用 // 終止條件判斷 if (old_dist - avg_dist > 2) old_dist = avg_dist; else break; } while (++nCurCenter <= nMaxCenter_); nCenter = nCurCenter - 1; for (j=0; j<nCenter; ++j) //將最佳群心拷貝到 set (center[j], old_center[j]); //center 中 cluster_assignment (); //將資料歸屬到最終的群心 for (i=0; i<2; ++i) { //合併相近的群 joint_clusters (); //共 refine 2 次 cal_avg_dist (); cluster_reassignment (); } };
底下測試框架生成 2D GMM 群落,對各算法進行比對:
template <class T> struct KMeanTest { enum {CD = 3, MAX_CLUSTERS = (CD+1)*(CD+1)*(CD+1), //最大群心數 W=500, H=500}; //展示影像的寬高 CvScalar color_tbl[MAX_CLUSTERS]; IplImage* img [ALGO_NUM]; //2D展示影像 T** data; //資料集合 int nCenter; //群心個數 int nData; //資料項數 int dim; //資料維度 void generate_color_table() { int r, g, b, i=0, d = 255/CD; for (r=0; r<256; r+=d) for (g=0; g<256; g+=d) for (b=0; b<256; b+=d) color_tbl[i++] = CV_RGB (r,g,b); shuffle (color_tbl+1, MAX_CLUSTERS-2); } //以 nCluster 個高斯 (正態) 分佈產生樣本點 //若 nCluster = -1 , 就隨機產生樣本點 //若 nCluster = 0 , 就隨機令 nCluster = 1 ~ MAX_CLUSTERS void generate_sample (int nCluster = -1) { release(); double sx, sy; int cx, cy; int i = 0; data = (T**) new_arr (sizeof(T), 2, nData, dim); if (nCluster == -1) //若中心點數 = -1 for (int i=0; i<nData; ++i) { //就隨機產生樣本點 data[i][0] = T (rand() % W); data[i][1] = T (rand() % H); } else { if (nCluster == 0) nCluster = rand() % MAX_CLUSTERS + 1; CvRNG rng = cvRNG (cvGetTickCount()); CvMat* points = cvCreateMat (nData, 1, CV_32SC2); for (int k = 0; k < nCluster; ++k) //generate random sample { //from multi-gaussian //distribution const int B = 80; //中心點離邊框的距離 cx = B + rand() % (W-2*B); cy = B + rand() % (H-2*B); sx = W * ((0.01 + cvRandReal (&rng))/ 10.0); sy = H * ((0.01 + cvRandReal (&rng))/ 10.0); //令群數 = A, A = (nData/ nCluster) * (1 +- 0.1) double ratio = 1 + (cvRandReal (&rng) - 0.5)/5; points->rows = int ((nData / nCluster) * ratio); //到最後一群時, 若還剩下很多名額, 就都分配給它 if (k == nCluster-1 && points->rows < nData-i) points->rows = nData - i; cvRandArr (&rng, points, CV_RAND_NORMAL, cvScalar (cx, cy), //平均數 cvScalar (sx, sy)); //標準差 for (int j=0; j<points->rows; ++j) { data[i][0] = (T) points->data.i[j*2]; data[i][1] = (T) points->data.i[j*2+ 1]; i++; if (i >= nData) break; } if (i >= nData) break; } cvReleaseMat( &points ); } } };
比對圖例:
3 - GM:
6 - GM:
訂閱:
文章 (Atom)