统计文章里动词词干使用频率

想知道哪些是斯瓦希里语中出现频率最高的动词词干,做一个最常用动词词干列表。方法是:一个词一个词地阅读一篇文章,检测这个词是否是动词的变形(之前的部分是否符合特定变形规则,在脚本里面被称为 pre-set),如果是的话就放进一个字典里。最后统计一下字典里最常出现的那些动词词干。

需要动词词干的列表

[code language="python"]
# Prepare the pre-set for affirmative verb forms: a legal prefix is a
# subject marker (pre1) + a tense marker (pre2), optionally followed by
# an object marker (pre3).
pre1 = ['a', 'i', 'ki', 'ku', 'li', 'm', 'mu', 'ni', 'pa', 'tu', 'u', 'vi', 'wa', 'ya', 'zi']
pre2 = ['ka', 'ki', 'li', 'me', 'mesha', 'na', 'ta']
pre3 = ['i', 'ki', 'ku', 'li', 'm', 'mu', 'mw', 'ni', 'pa', 'tu', 'u', 'vi', 'wa', 'ya', 'zi']
preS1 = set()
for p1 in pre1:
    for p2 in pre2:
        preS1.add(p1 + p2)
        for p3 in pre3:
            preS1.add(p1 + p2 + p3)
# "hu" and "ku" stand alone as prefixes (no subject marker); add them once,
# hoisted out of the loop, with and without a following object marker.
preS1.add("hu")
preS1.add("ku")
for p3 in pre3:
    preS1.add("hu" + p3)
    preS1.add("ku" + p3)
# Prepare the pre-set for the negative present tense: a negative subject
# marker (pre1) alone, or followed by an object marker (pre2).
# (Duplicate entries from the original list removed; the set is unchanged.)
pre1 = ['ha', 'hai', 'haki', 'haku', 'hali', 'ham', 'hamu', 'hapa', 'hatu', 'hau', 'havi', 'hawa', 'haya', 'hazi', 'hu', 'si']
pre2 = ['i', 'ki', 'ku', 'li', 'm', 'mu', 'mw', 'ni', 'pa', 'tu', 'u', 'vi', 'wa', 'ya', 'zi']
preS2 = set()
for p1 in pre1:
    preS2.add(p1)
    for p2 in pre2:
        preS2.add(p1 + p2)
# Prepare the pre-set for subjunctive forms: a subject marker (pre1),
# optionally with the negative marker "si" (pre2) and/or an object
# marker (pre3).
pre1 = ['a', 'i', 'ki', 'ku', 'li', 'm', 'mu', 'ni', 'pa', 'tu', 'u', 'vi', 'wa', 'ya', 'zi']
pre2 = ['si']
pre3 = ['i', 'ki', 'ku', 'li', 'm', 'mu', 'mw', 'ni', 'pa', 'tu', 'u', 'vi', 'wa', 'ya', 'zi']
preS3 = set()
for p1 in pre1:
    preS3.add(p1)
    for p2 in pre2:
        preS3.add(p1 + p2)
        for p3 in pre3:
            preS3.add(p1 + p3)
            preS3.add(p1 + p2 + p3)

# Prepare the pre-set for the other negative tenses: a negative subject
# marker (pre1) + tense marker (pre2), optionally followed by an object
# marker (pre3). (Duplicates removed from pre1; the set is unchanged.)
pre1 = ['ha', 'hai', 'haki', 'haku', 'hali', 'ham', 'hamu', 'hapa', 'hatu', 'hau', 'havi', 'hawa', 'haya', 'hazi', 'hu', 'si']
pre2 = ['ja', 'ku', 'ta']
pre3 = ['i', 'ki', 'ku', 'li', 'm', 'mu', 'mw', 'ni', 'pa', 'tu', 'u', 'vi', 'wa', 'ya', 'zi']
preS4 = set()
for p1 in pre1:
    for p2 in pre2:
        preS4.add(p1 + p2)
        for p3 in pre3:
            preS4.add(p1 + p2 + p3)

# Prepare the verb-stem counter, the stem list, and the document.

# Maps verb stem -> number of occurrences found in the text.
verbD = {}

# NOTE(review): "dict" shadows the built-in dict; kept as-is because the
# processing loop below refers to it by this name.
# Close the stem file as soon as its content is read.
with open("verb_stems.txt") as mydict:
    dictF = mydict.read()
    dict = dictF.split()

# Deliberately left open: the main loop consumes it one readline() at a time.
myfile = open("1.txt")
line = myfile.readline()

# Junk words, which are most likely nouns or adverbs; they are skipped by
# the processing loop.
junk = {'hakimu', 'kijamii', 'kitaalamu', 'mbali', 'mradi', 'mshauri',
        'mstaafu', 'mujibu', 'taasisi', 'unyonge', 'upande', 'ushahidi',
        'utalii', 'utamaduni', 'vipande', 'wachache', 'wajibu', 'washauri',
        'washiriki', 'wataalamu', 'watalii'}
# processing the words

def ongeza():
    """Record one occurrence of the matched stem and log the match.

    Relies on module globals set by the main loop: ``l`` (the matched
    stem), ``word`` (the word it was found in) and ``verbD`` (the
    stem -> count dictionary).
    """
    # Single lookup via get() replaces the original double membership test.
    verbD[l] = verbD.get(l, 0) + 1
    # Parenthesized print works under both Python 2 and Python 3.
    print(l + " in " + word)

def _stem_match(w, stem, prefixes):
    # True when w ends with stem, something precedes it, and that leading
    # remainder is one of the known conjugation prefixes.
    return w.endswith(stem) and len(w) > len(stem) and w[:-len(stem)] in prefixes

# Process the document word by word, counting each word at most once.
while line:
    words = line.split()
    for word in words:
        word = word.strip(".,!?").lower()
        if word in junk:
            continue
        for l in dict:
            # Negative-present and subjunctive forms swap a stem-final
            # 'a' for 'i' / 'e' respectively.
            lN = l[:-1] + l[-1].replace("a", "i")
            lS = l[:-1] + l[-1].replace("a", "e")
            # Check in the original order: affirmative, negative present,
            # subjunctive, other negatives; stop at the first match
            # (break replaces the original "stop" flag).
            if (_stem_match(word, l, preS1)
                    or _stem_match(word, lN, preS2)
                    or _stem_match(word, lS, preS3)
                    or _stem_match(word, l, preS4)):
                ongeza()
                break
    line = myfile.readline()

# Report the counts, sorted by stem.
# print(verbD)
for key in sorted(verbD.keys()):
    print(key + ": " + str(verbD[key]))
[/code]

结果(截选):
[code]
choshwa: 3
chukua: 3
daiwa: 2
dumu: 2
eleza: 10
elezea: 6
endekeza: 1
endelea: 8
endeleza: 4
endesha: 2
epuka: 1
epusha: 1
fafanua: 1
fahamisha: 1
fahamu: 1
faidika: 1
fanya: 7
[/code]

 

ynshen