雜湊基於詞頻的檔案相似度

實現一種簡單原始的檔案相似度計算，即以兩檔案的公共詞彙佔總詞彙的比例來定義相似度。為簡化問題，這裡不考慮中文（因為分詞太難了），只考慮長度不小於3、且不超過10的英文單詞，長度超過10的只考慮前10個字母。

輸入首先給出正整數n（<= 100），為檔案總數，隨後按以下格式給出每個檔案的內容：首先給出檔案正文，最後在一行中只給出一個字元「#」，表示檔案結束。在n個檔案內容結束之後，給出查詢總數m（<= 10^4），隨後m行，每行給出一對檔案編號，其間以空格分隔。這裡假設檔案按給出的順序從1到n編號。

針對每一條查詢，在一行中輸出兩檔案的相似度，即兩檔案的公共詞彙量佔兩檔案總詞彙量的百分比，精確到小數點後1位。注意，這裡的一個「單詞」只包含僅由英文字母組成的、長度不小於3、且不超過10的英文單詞，長度超過10的只考慮前10個字母。單詞間以任何非英文字母隔開。另外，大小寫不同的同一單詞被認為是相同的單詞，例如「You」和「you」是同一個單詞。

將每個單詞利用雜湊函式對映到對應的雜湊表中，同時將檔案編號插入到雜湊表中的倒排索引表，之後將單詞在雜湊表中的位置存入每個檔案的詞彙索引表。計算兩個檔案的相似度時，只需要選擇詞彙量較小的那個檔案，遍歷該檔案的詞彙索引表，找到單詞在雜湊表中的位置並掃瞄該單詞的倒排索引表，如果倒排索引表中的檔案編號與另一個檔案的編號相同，則說明該單詞同時出現在兩個檔案中。

極致碼農題

#include #include #include #include #define maxs 10
#define mins 3
#define maxb 5
#define maxtable 500009
typedef char elementtype[maxs + 1];
typedef struct fileentrywlist;
typedef struct wordentryflist;
struct hashentry;
typedef struct hashtblhashtable;
hashtable* table_init(int tablesize)
return h;
}wlist* fileindex_init(int size)
return f;
}int getword(elementtype word)
while (isalpha(c))
scanf("%c", &c);
if (p < mins)
return getword(word);
else
}int hash(char *key,int p)
int find(elementtype key, hashtable *h)
return pos;
}int insertandindex(int fileno, elementtype key, hashtable *h)
else
return -1;
}void fileindex(wlist *file, int fileno, int pos)
double work(wlist *file, int f1, int f2, hashtable *h)
temp = 0;
w = file[f1-1].next;
while (w) 
if (f)
temp++;
w = w->next;
}return ((double)(temp * 100)/ (double)(file[f1 - 1].words + file[f2 - 1].words - temp));
}int main()
return 0;
}

檔案的詞彙索引表：

typedef struct fileentrywlist;

簡化版雜湊表定義以及初始化：

typedef struct wordentryflist;
struct hashentry;
typedef struct hashtblhashtable;
hashtable* table_init(int tablesize)
return h;
}

初始化檔案索引表：

wlist* fileindex_init(int size)
return f;
}

讀取單詞：

int getword(elementtype word)
while (isalpha(c)) //跳過超長的字母（相當於只讀取、不儲存）
scanf("%c", &c);
if (p < mins) //太短的單詞不要，讀下一個
return getword(word);
else
}

字串移位法雜湊函式（雜湊函式）：

int hash(char *key,int p)

在雜湊表中分配單詞及查詢單詞的位置：

int find(elementtype key, hashtable *h)
return pos;
}

將單詞插入雜湊表，同時插入對應的倒排索引表：

int insertandindex(int fileno, elementtype key, hashtable *h)
else
return -1; //同一檔案重複單詞，不插入
}

將單詞在雜湊表中的位置存入檔案索引表：

void fileindex(wlist *file, int fileno, int pos)

計算兩個檔案之間的相似度：

double work(wlist *file, int f1, int f2, hashtable *h) //選擇詞彙量較小的那個檔案作為檔案索引表的檔案
temp = 0; //統計公共詞彙量
w = file[f1-1].next; //掃瞄檔案的詞彙索引表
while (w) 
if (f)
temp++; //說明該單詞是公共的
w = w->next;
}//兩檔案的詞彙總量 = 兩檔案詞彙量的和 - 公共詞彙量
return ((double)(temp * 100)/ (double)(file[f1 - 1].words + file[f2 - 1].words - temp));
}

主程式部分：

int main()
return 0;
}

雜湊基於詞頻的檔案相似度

基於詞頻的檔案相似度

7 44 基於詞頻的檔案相似度（30 分）

7 44 基於詞頻的檔案相似度（30 分

雜湊 基於詞頻的檔案相似度

基於詞頻的檔案相似度

7 44 基於詞頻的檔案相似度（30 分）

7 44 基於詞頻的檔案相似度 （30 分

相關推薦

雜湊基於詞頻的檔案相似度

7 44 基於詞頻的檔案相似度（30 分