Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

#!/usr/bin/python 

# -*- coding: utf-8 -*- 

import csv 

import sys 

import logging 

import re 

from datetime import date, datetime 

 

from members.models import Member 

 

# Import-time side effect: sets the level of the root logger to DEBUG, so the
# logging.debug() calls made throughout this module are not filtered by level.
logging.getLogger().setLevel(logging.DEBUG) 

 

 

class TransactionReader:
    """
    Parses the fields of one CSV row as received from the bank, including the
    candidate chaos number(s) found in the payment purpose text.

    Attributes set by ``__init__``:
        string_in: the row re-joined with ';'
        booking_day, available_on: ``date`` objects, or None if unparseable
        payment_type, information, payer, payee: raw column values
        referenz, verwendungszweck: parts extracted from ``information``
        amount, balance: integers built from the digits (and '-') of the
            respective columns, i.e. separators are dropped
        income: True only when the payee is one of ``_OWN_PAYEE_NAMES``
        member, rating, donation: result of ``get_chaos_number`` for incoming
            transactions; otherwise None, None and False
    """

    # payee spellings that mark a transaction as incoming
    _OWN_PAYEE_NAMES = ("Chaos Computer Club e.V.", "CHAOS COMPUTER CLUB E.V.")
    # one run of ASCII digits; compiled once instead of on every call
    _DIGIT_RUN_RE = re.compile('[0-9]+')

    def __init__(self, line_list):
        """
        :param line_list: list of strings, one per CSV column; indexes 0-7 are read
        """
        self.string_in = ';'.join(line_list)
        logging.debug("string_in: {}".format(self.string_in))
        try:
            self.booking_day = datetime.strptime(line_list[0], '%d.%m.%Y').date()
            self.available_on = datetime.strptime(line_list[1], '%d.%m.%Y').date()
        except (ValueError, TypeError) as e:
            # either date being malformed invalidates both
            logging.warning(e)
            self.booking_day = None
            self.available_on = None
        self.payment_type = line_list[2]
        logging.debug("payment_type: {}".format(self.payment_type))
        self.information = line_list[3]
        self.referenz = self.get_referenz(self.information)
        logging.debug("referenz: {} ".format(self.referenz))
        self.verwendungszweck = self.get_verwendungszweck(self.information)
        logging.debug("verwendungszweck: {}".format(self.verwendungszweck))
        self.payer = line_list[4]
        self.payee = line_list[5]
        logging.debug("payer, payee: {}, {} ".format(self.payer, self.payee))
        try:
            # keep only digits and '-', so "1.234,56" becomes 123456
            self.amount = int(re.sub(r'[^0-9\-]', '', line_list[6]))
            self.balance = int(re.sub(r'[^0-9\-]', '', line_list[7]))
        except ValueError as e:
            # best effort: fall back to zero for both, but leave a trace
            logging.warning("could not parse amount/balance: {}".format(e))
            self.amount = 0
            self.balance = 0
        logging.debug("amount, balance: {} {} ".format(self.amount, self.balance))
        self.member = None
        self.rating = None
        # always defined, so non-incoming rows do not raise AttributeError on access
        self.donation = False
        # mark only incoming for import
        self.income = False
        if self.payee in self._OWN_PAYEE_NAMES:
            self.income = True
            self.member, self.rating, self.donation = self.get_chaos_number()
            logging.info("CHAOSNR from get chaos number {}".format(self.member))

    def __str__(self):
        """Returns the string form of the instance's attribute dictionary."""
        return str(self.__dict__)

    def get_referenz(self, string):
        """
        Extracts the text between 'Referenz ' and 'Verwendungszweck '.

        :param string: the information column of the row
        :return: the stripped reference, or '' if the pattern does not match
        """
        m = re.search('Referenz (.*)Verwendungszweck .*', string)
        if m is None:
            return ''
        return m.group(1).strip()

    def get_verwendungszweck(self, string):
        """
        Extracts the text following 'Verwendungszweck '.

        :param string: the information column of the row
        :return: the stripped purpose text, or ``string`` unchanged if the
            marker is absent
        """
        m = re.search('Verwendungszweck (.*)', string)
        if m is None:
            return string
        return m.group(1).strip()

    @classmethod
    def _split_numbers(cls, word):
        """
        Splits a word holding several digit runs into one entry per run.

        :return: each run suffixed with 'part' if there are several runs,
            the bare run if there is exactly one, else ``[word]``
        """
        nums = cls._DIGIT_RUN_RE.findall(word)
        if len(nums) > 1:
            return [num + 'part' for num in nums]
        if not nums:
            return [word]
        return nums

    def get_chaos_number(self):
        """
        Parses and rates best guess Chaos Number from verwendungszweck.

        :return: tuple of (candidate numbers sorted best first, rating of the
            best candidate or None, whether a donation keyword was found)
        """
        logging.info("RUN GET CHAOS NUMBER")

        words = []
        for word in self.verwendungszweck.split():
            word = strip_punctuation(word)
            words += self._split_numbers(word)
        logging.info(words)
        wordscores = [WordScore(word) for word in words]
        finalscores = WordsWithScore(wordscores)
        sorted_chaos_numbers, rating, donation = finalscores.sort_rate_donate()
        return sorted_chaos_numbers, rating, donation

 

 

def clean_number(pre_number):
    """
    Removes every non-digit character from the start and the end of a string.

    Non-digits between digits are kept, e.g. 'x12a34y' becomes '12a34'.

    :param pre_number: string, typically containing at least one digit
    :return: the trimmed string ('' if it held no digits)
    """
    return re.sub(r'^\D*|\D*$', '', pre_number)

 

 

def strip_punctuation(word):
    """
    Removes leading and trailing punctuation from a word.

    Only the characters . , : ; ( ) are stripped; punctuation inside the word
    is left alone.

    :param word: string to trim
    :return: the trimmed string (possibly empty)
    """
    return word.strip('.,:;()')

 

 

class WordScore:
    """
    Receives a word (from the verwendungszweck) and scores similarity to Chaos Number independent of other words.

    Attributes:
        word: the word; for numbers it is reduced to its cleaned form while scoring
        word_class: 'number', 'chaos_key', 'membership', 'donation' or 'unknown'
        score: penalty total (lower is better) for numbers, None for all other classes
    """

    # some penalties here, some with WordsWithScore (interdependent penalties)
    # penalties set up to be hierarchical, is_part < not_in_db < multiple_numbers < no_key < neighboring < year
    # author's note: status is set to unknown chaos number if all of the first four, or either of the last two,
    # apply to the best candidate; otherwise scores are used for sorting / choosing among multiple candidates
    PENALTIES = {"not_in_db": 2,
                 "is_recent_year": 32,
                 "is_part": 1}

    # patterns are compiled once for the class rather than once per word
    _PART_NUM_RE = re.compile(r'\d+')  # matches anything containing digits
    _FULL_NUM_RE = re.compile(r'^\d+$')  # matches only words made of digits
    # matches many variations of chaosnummer/nummer/nr
    _CHAOS_KEY_RE = re.compile('chaos.*[rno]$|^n[ro]?$|^num.*r$', re.IGNORECASE)
    # matches indications this is a donation, not membership
    _DONATION_RE = re.compile('spende|dank|donat|thank|eyes open|post ist lame', re.IGNORECASE)
    # matches indications this is a membership payment
    _MEMBERSHIP_RE = re.compile('beitrag|geb.+hr|dues|fee|member', re.IGNORECASE)

    def __init__(self, word):
        """
        :param word: string without spaces
        """
        self.word = word
        self.word_class = self.get_word_class()
        self.score = None
        if self.word_class == "number":
            self.set_score()

    # we're playing golf, and low is better
    def set_score(self):
        """
        Increments score (lower is better) if number is partial, corresponds to no member in db, or is a recent year.
        """
        self.score = 0
        self.score_part()
        self.clean()  # have to call partial-number-or-not (.score_part) before this, anything requiring integer, after
        self.score_not_in_db()
        self.score_recent_year()

    def get_word_class(self):
        """
        Classifies the word; the first matching class in the order below wins.

        :return: 'number' (contains a digit), 'chaos_key' (e.g. 'chaosnummer', 'nr'),
            'membership', 'donation', or 'unknown'
        """
        word = self.word
        if self._PART_NUM_RE.search(word):
            return 'number'
        if self._CHAOS_KEY_RE.search(word):
            return 'chaos_key'
        if self._MEMBERSHIP_RE.search(word):
            return 'membership'
        if self._DONATION_RE.search(word):
            return 'donation'
        return 'unknown'

    def clean(self):
        """Replaces the word by clean_number(word), i.e. without surrounding non-digits."""
        self.word = clean_number(self.word)

    def _as_int(self):
        """Returns the word as an integer, or None if it is not a plain integer (e.g. '12a34')."""
        try:
            return int(self.word)
        except ValueError:
            return None

    def score_not_in_db(self):
        """Fetches members that would match putative chaos number, and penalizes no match."""
        number = self._as_int()
        # a word that is no integer cannot be a primary key, so it is penalized without a query
        # NOTE(review): very long digit runs may exceed the database's integer range - confirm backend behaviour
        if number is None or not Member.objects.filter(pk=number).exists():
            self.score += self.PENALTIES["not_in_db"]

    def score_recent_year(self):
        """Penalizes numbers equal to one of the last five years, the current year, or next year."""
        number = self._as_int()
        if number is None:
            return
        year = datetime.now().year
        if year - 5 <= number <= year + 1:
            self.score += self.PENALTIES["is_recent_year"]

    def score_part(self):
        """Penalizes alphanumeric or punctuation mixes, punctuation has been pre-stripped from start and end."""
        if not self._FULL_NUM_RE.search(self.word):
            self.score += self.PENALTIES["is_part"]

 

 

class WordsWithScore:
    """
    Detects and handles all inter-dependent scoring of words from verwendungszweck.

    Attributes:
        words: the scored word objects, in their original order
        nums: indexes into ``words`` of the entries whose word_class is "number"
    """
    PENALTIES = {"no_key": 8,
                 "multiple_numbers": 4,
                 "neighboring": 16}

    def __init__(self, words):
        """
        Takes individually scored chaosnumber candidates, and runs interdependent scoring measures.

        The scores of the number entries in ``words`` are modified in place.

        :param words: list of objects exposing ``word``, ``word_class`` and ``score``
        """
        self.words = words
        self.nums = [i for i, word in enumerate(self.words) if word.word_class == "number"]
        # the order matters: each step compares the scores left by the previous one
        self.score_no_keys()
        self.score_neighboring()
        self.score_multiple_numbers()

    def score_no_keys(self):
        """
        Penalty for numbers not preceded by chaos key (relative bonus for e.g. chaosnummer. 123).
        """
        for i in self.nums:
            # the first word has no predecessor, so it can never follow a key
            if i == 0 or self.words[i - 1].word_class != "chaos_key":
                self.words[i].score += self.PENALTIES["no_key"]

    def score_neighboring(self):
        """
        Extra penalty for tied neighboring numbers (think e.g. 12 3, or 201 4).

        A number is penalized unless its score is strictly lower than that of
        every directly adjacent number.
        """
        last = len(self.words) - 1
        to_add = []  # list to note penalties until all are calculated and then apply
        for i in self.nums:  # for each number from the verwendungszweck
            # the word itself plus its direct neighbors, clipped to the list bounds
            neighbors = range(max(i - 1, 0), min(i + 1, last) + 1)
            num_neighbors = [x for x in neighbors if self.words[x].word_class == "number"]  # filter to numbers
            if not self.is_lowest(i, num_neighbors):
                to_add.append(i)  # note penalty if current number's score is not the best of neighbor's scores
        # apply penalties
        for i in to_add:
            self.words[i].score += self.PENALTIES["neighboring"]

    def score_multiple_numbers(self):
        """
        Penalty for each number from verwendungszweck with tied or better chaos number candidate.
        """
        # decide on all penalties before applying any, so the comparison uses unmodified scores
        to_penalize = [num for num in self.nums if not self.is_lowest(num, self.nums)]
        # apply penalties
        for num in to_penalize:
            self.words[num].score += self.PENALTIES["multiple_numbers"]

    def is_lowest(self, i, subnums):
        """
        Checks if the WordScore.score of index = i in self.words is lower than that of the indexes in subnums.

        :param i: index of target number
        :param subnums: indexes of the candidate numbers to compare against; entries equal to ``i`` are ignored
        :return: Boolean; True as well when there is nothing to compare against
        """
        if len(subnums) == 1:  # if it's the only one, it's the lowest
            return True
        other_scores = [self.words[x].score for x in subnums if x != i]
        if not other_scores:  # nothing to compare against (avoids min() of an empty list)
            return True
        return self.words[i].score < min(other_scores)

    def has_donation(self):
        """Returns True if any word was classified as "donation"."""
        return any(word.word_class == "donation" for word in self.words)

    def sort_rate_donate(self):
        """
        Sorts chaos numbers, returns score for best, and whether or not the verwendungszweck indicated a donation.

        :return: tuple of (number words sorted by ascending score, score of the best one or None, donation flag)
        """
        to_sort = [(self.words[x].word, self.words[x].score) for x in self.nums]
        to_sort.sort(key=lambda pair: pair[1])  # stable: ties keep their original order
        sorted_chaos_numbers = [pair[0] for pair in to_sort]
        if to_sort:
            rating = to_sort[0][1]
        else:
            rating = None  # will be set in error handling for failed chaos number in models.Transaction
        donation = self.has_donation()
        return sorted_chaos_numbers, rating, donation

 

 

def main():
    """
    Trouble-shooting tester when not used as part of app.

    Reads the sample bank export, prints a separator for every row, and prints
    the parsed TransactionReader for every row that has exactly 8 columns.

    :return: None
    """
    path = "../../testdata/PB_Umsatzauskunft_KtoNr0599090201_01-08-2015_2007.csv"
    # the with-block closes the file; newline="" lets the csv module handle line endings itself
    with open(path, encoding="iso-8859-1", newline="") as csv_file:
        for item in csv.reader(csv_file, delimiter=';'):
            print("---")
            if len(item) == 8:
                x = TransactionReader(item)
                print(x)

 

 

# run the trouble-shooting tester only when executed as a script, not on import
if __name__ == "__main__":
    main()