Erlang實戰建立文字索引

為文字建立索引是文字資訊處理的一個重要任務：給定一個由英文單詞構成的檔案，為檔案中所有單詞建立索引，記錄每個單詞出現的行號和每行出現的次數，並將索引存入一個檔案。在erlang實戰練習（六）中我強調了當時建立文字索引的方式太粗糙：一是使用了應當盡量規避的程序字典；二是分詞使用的是正規表示式，不夠靈活。本文將改進我以前建立文字索引的方式，使用ets來儲存單詞及其索引列表，同時分詞改用erlang提供的string:tokens函式，更加靈活，可移植性也更好。

word_index.erl檔案的總體結構如下：

-module
(word_index).
-export([start/2]).
-import(re, [run/2,replace/4]).
-import(string,[substr/3]).
%% start/2 builds a word index for a text file.
%%   FileIn  - path of the text file to index.
%%   FileOut - path of the file the index is written to.
%% Opens FileIn read-only, reads it into a list of numbered lines, indexes
%% every line into an ETS table and lets index/3 write the result to FileOut.
%% If FileIn cannot be opened, only an error message is printed.
start(FileIn, FileOut) ->
    case file:open(FileIn, read) of
        {ok, Device} ->
            %% readfile/2 returns the file contents as a list with one entry per line.
            LineList = readfile(Device, 0),
            %% ordered_set keeps one entry per key, sorted by key (the word).
            TableId = ets:new(index, [ordered_set]),
            try
                %% Index every line, then dump the table to FileOut.
                index(FileOut, LineList, TableId)
            after
                %% The table is only needed while building the index; delete it
                %% so repeated calls from one process do not accumulate tables.
                ets:delete(TableId)
            end;
        {error, _Reason} ->
            io:format("open file error: file doesn't exist!")
    end.

readfile/2函式的程式碼如下：

%% readfile/2 reads every line from the open IO device S and returns a list
%% of {LineNo, Line} tuples in file order. Numbering starts at LineNo + 1.
%% The device is closed once end of file is reached.
readfile(S, LineNo) ->
    readfile(S, LineNo, []).

%% readfile/3 carries the lines read so far in Ret, newest first.
readfile(S, LineNo, Ret) ->
    UpdateLineNo = LineNo + 1,
    %% Read one line; the empty atom means no prompt is shown.
    case io:get_line(S, '') of
        eof ->
            io:format("read file eof!"),
            file:close(S),
            %% Lines were prepended, so reverse to restore file order.
            lists:reverse(Ret);
        {error, Reason} ->
            %% A read error is not a line: close the device and crash rather
            %% than storing the error tuple and retrying forever.
            file:close(S),
            error({read_error, Reason});
        OneLine ->
            readfile(S, UpdateLineNo, [{UpdateLineNo, OneLine} | Ret])
    end.

index/3函式的程式碼如下：

%% index/3 indexes each entry of LineList into the ETS table TableId, one
%% line per recursive step. When the list is exhausted it prints the table
%% contents, writes them to File via writetofile/2 and prints a success
%% message.
index(File, [], TableId) ->
    ToList = ets:tab2list(TableId),
    io:format("index is:~n~p~n", [ToList]),
    writetofile(File, ToList),
    io:format("create index success! ");
index(File, [First | Rest], TableId) ->
    %% Matching on the list head avoids an O(n) length/1 call per line.
    processoneline(First, TableId),
    index(File, Rest, TableId).
%% processoneline/2 handles one line of text.
%% OneLine is a {LineNo, Text} tuple. Text is split on newline, tab and
%% space, and the resulting words are indexed under LineNo in TableId.
processoneline({LineNo, Text}, TableId) ->
    Words = string:tokens(Text, "\n\t "),
    matchwords(Words, LineNo, TableId).
%% matchwords/3 records every word of one line in the ETS table TableId.
%% Each table entry is {Word, [{LineNo, Count}]}: the lines the word occurs
%% on and how many times it occurs on each. Prints a message when the line
%% is done.
matchwords([], LineNo, _TableId) ->
    io:format("process line(~p) success!~n", [LineNo]);
matchwords([Word | Rest], LineNo, TableId) ->
    %% ets:lookup/2 returns the list of entries stored under Word.
    case ets:lookup(TableId, Word) of
        [] ->
            %% Word not indexed yet: start its list with this line.
            ets:insert(TableId, {Word, [{LineNo, 1}]});
        [{_Key, Value} | _] ->
            %% Word already indexed: update its line/count list.
            ets:insert(TableId, {Word, insertrec(Value, LineNo)})
    end,
    matchwords(Rest, LineNo, TableId).
%% insertrec/2 updates a list of {LineNo, Count} tuples for one more
%% occurrence on line LineNo.
%% If the list already has an entry for LineNo, that entry is removed and
%% re-added at the front with its count incremented. Otherwise {LineNo, 1}
%% is added at the front.
insertrec(List, LineNo) ->
    case lists:keytake(LineNo, 1, List) of
        {value, {Line, Count}, Rest} ->
            [{Line, Count + 1} | Rest];
        false ->
            [{LineNo, 1} | List]
    end.
%% writetofile/2 writes the index to a file.
%% Opens File for writing, prints every element of ToList as an Erlang term
%% followed by a full stop and newline, then closes the file and returns
%% the result of file:close/1. Crashes with badmatch if File cannot be
%% opened.
writetofile(File, ToList) ->
    {ok, S} = file:open(File, write),
    lists:foreach(
        fun(X) -> io:format(S, "~p.~n", [X]) end,
        ToList
    ),
    file:close(S).

至此，我已經將使用ets儲存鍵值大型表來儲存單詞索引列表的程式講完了，大家自己回去動手試驗吧。本文是上文的一個續篇，是一種改進的建立文字索引方式。以後我還會繼續通過實戰練習來進行erlang的學習與總結思考，謝謝大家的關注。

Erlang實戰建立文字索引

建立文字區域

建立文字形式的Sitemap

建立文字框背景

Erlang實戰 建立文字索引

建立文字區域

建立文字形式的Sitemap

建立文字框背景

相關推薦

Erlang實戰建立文字索引