""" 

Module provides simple basic classes for sequence generation using Markov models. 

""" 

from __future__ import unicode_literals, division, print_function

import random

import numpy as np

from lingpy.util import setdefaults
from lingpy.settings import rcParams
from lingpy.sequence.sound_classes import ipa2tokens, prosodic_string, tokens2class

class MCBasic(object):
    """
    Basic class for creating Markov chains from sequence training data.

    Parameters
    ----------
    seqs : list
        A list of sequences. The sequences are assumed to be tokenized, i.e.
        they should be passed either as lists or as tuples.
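
    Examples
    --------
    A minimal usage sketch on toy tokenized data (the sequences are purely
    illustrative):

    >>> mc = MCBasic([['t', 'a'], ['t', 'o'], ['d', 'a']])
    >>> sorted(mc.dist['#'])  # possible word-initial symbols
    ['d', 't', 't']
    >>> seq = mc.walk()  # a random walk, e.g. ['t', 'o']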

 

""" 

 

    def __init__(self, seqs):
        self.seqs = seqs

        # create the transition distribution: map each symbol to the list of
        # symbols that follow it in the training data, using '#' and '$' as
        # start and end markers
        self.dist = {}
        for seq in self.seqs:
            seq = list(seq)  # accept tuples as well as lists
            for s1, s2 in zip(['#'] + seq, seq + ['$']):
                try:
                    self.dist[s1] += [s2]
                except KeyError:
                    self.dist[s1] = [s2]

 

    def walk(self):
        """
        Create a random sequence from the distribution.
        """
        out = []

        # pick a start symbol from the successors of the start marker
        startS = random.choice(self.dist['#'])
        out += [startS]

        while True:
            nextS = random.choice(self.dist[out[-1]])

            # stop at the terminal symbol
            if nextS == '$':
                break

            out += [nextS]

        return out

 

 

class MCPhon(MCBasic):
    """
    Class for the creation of phonetic sequences ("pseudo words").

    Parameters
    ----------
    words : list
        List of phonetic sequences. This list can contain tokenized sequences
        (lists or tuples) or simple untokenized IPA strings.

    tokens : bool (default=False)
        If set to True, the input sequences are assumed to be tokenized
        already, and no tokenization is carried out.

    prostrings : list (default=[])
        List containing the prosodic profiles of the input sequences. If the
        list is empty, the profiles are generated automatically.

    classes : bool (default=False)
        If set to True, the tokens are converted to sound classes (following
        class_model) before the Markov chain is created.

    class_model : Model (default=rcParams['model'])
        The sound-class model used for the conversion.
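
    Examples
    --------
    A brief usage sketch (the words are toy IPA strings, purely illustrative,
    and the generated output is a random draw):

    >>> mc = MCPhon(['papa', 'mama', 'pamu'])
    >>> pseudo_word = mc.get_string(new=False)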

 

""" 

 

    def __init__(
            self,
            words,
            tokens=False,
            prostrings=[],
            classes=False,
            class_model=rcParams['model'],
            **keywords):
        setdefaults(keywords, stress=rcParams['stress'],
                    diacritics=rcParams['diacritics'], cldf=False)
        self.model = class_model
        self.words = words
        self.tokens = []
        self.bigrams = []
        self.classes = []

 

        # collect tokens, prosodic strings, and bigrams for all input words
        for i, w in enumerate(words):

            # tokenize the input unless it is already tokenized
            if not tokens:
                tk = ipa2tokens(w, **keywords)
            else:
                tk = w[:]
            self.tokens += [tk]

            # take the prosodic profile if one was passed, otherwise create it
            if prostrings:
                p = prostrings[i]
            else:
                p = prosodic_string(
                    tk,
                    rcParams['art'],
                    cldf=keywords['cldf'],
                    diacritics=keywords['diacritics'],
                    stress=keywords['stress'])

            # convert the tokens to sound classes if requested, and pair each
            # segment with its prosodic context
            if classes:
                c = tokens2class(tk, class_model, cldf=keywords['cldf'],
                                 diacritics=keywords['diacritics'],
                                 stress=keywords['stress'])
                bigrams = list(zip(p, c))
                self.classes += [c]
            else:
                bigrams = list(zip(p, tk))

            self.bigrams += [bigrams]

        # initialize the parent Markov chain with the bigram sequences
        MCBasic.__init__(self, self.bigrams)

 

    def get_string(self, new=True, tokens=False):
        """
        Generate a string from the Markov chain created from the training data.

        Parameters
        ----------
        new : bool (default=True)
            Determine whether the string that is created should be different
            from the training data or not.
        tokens : bool (default=False)
            If set to *True*, the full list of (prosody, segment) tokens that
            is used internally to represent the sequences as a Markov chain is
            returned.
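
        Examples
        --------
        A sketch of the two output forms (the results are random draws, shown
        only to illustrate the shape of the return values):

        >>> mc = MCPhon(['papa', 'mama', 'pamu'])
        >>> word = mc.get_string(new=False)  # e.g. 'p a m a'
        >>> pairs = mc.get_string(new=False, tokens=True)  # [(prosody, segment), ...]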

""" 

 

        # create a first candidate string
        out = self.walk()

        # if new=True, resample until the walk differs from the training data
        while new:
            if out in self.bigrams:
                out = self.walk()
            else:
                break

        if tokens:
            return out
        return ' '.join([i[1] for i in out])

 

    def evaluate_string(self, string, tokens=False, **keywords):
        """
        Calculate the probability of a string under the Markov chain.

        Parameters
        ----------
        string : str
            The input sequence, passed as an IPA string.
        tokens : list (default=False)
            The tokenized version of the input sequence. If not passed, the
            string is tokenized with ipa2tokens.

        Returns
        -------
        score, lscore : tuple
            The raw probability of the string, and its log10-score divided by
            the length of the token sequence.
        """
        setdefaults(keywords, stress=rcParams['stress'],
                    diacritics=rcParams['diacritics'], cldf=False)
        if not tokens:
            tokens = ipa2tokens(string)
        score = 1
        dist = self.dist['#']

        # represent the input in the same (prosody, segment) form that was
        # used to train the Markov chain
        prostring = prosodic_string(tokens, rcParams['art'], cldf=keywords['cldf'],
                                    diacritics=keywords['diacritics'],
                                    stress=keywords['stress'])
        if self.classes:
            c = tokens2class(tokens, self.model, cldf=keywords['cldf'],
                             diacritics=keywords['diacritics'],
                             stress=keywords['stress'])
            teststring = list(zip(prostring, c))
        else:
            teststring = list(zip(prostring, tokens))

        scores = []

        # multiply up the relative frequencies of all transitions
        while len(teststring) > 0:
            segment = teststring.pop(0)
            freq = dist.count(segment)
            allf = len(dist)
            s = freq / allf
            score = score * s
            scores += [s]
            dist = self.dist[segment]

        # account for the final transition into the end symbol
        s = dist.count('$') / len(dist)
        score = score * s
        scores += [s]

        # length-normalized log score
        lscore = np.log10(score) / len(tokens)

        return score, lscore
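

if __name__ == '__main__':
    # Minimal demo sketch (the words are toy IPA strings, purely illustrative):
    # train on a few inputs, generate a pseudo word, and score one of them.
    mc = MCPhon(['papa', 'mama', 'pamu'])
    print('pseudo word:', mc.get_string(new=False))
    print('score of "papa":', mc.evaluate_string('papa'))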