1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
'''
Sparse Matrix
'''
import struct
import numpy as np
import bsddb
from cStringIO import StringIO
class DictMatrix(object):
    """Sparse matrix kept as append-only binary records inside a mapping.

    Every stored cell ``(i, j) -> value`` is written twice into the backing
    container:

    * under the key ``'i<i>'`` as a packed ``(j, value)`` record, and
    * under the key ``'j<j>'`` as a packed ``(i, value)`` record,

    so that both a whole row and a whole column can be read with a single
    lookup.  Records are only ever appended; when the same cell is written
    more than once, the most recently appended record wins on read.

    The container only needs ``container[key]``, ``container[key] = value``
    and a ``KeyError`` on missing keys (a plain ``dict`` or a dbm-style
    database both qualify).
    """

    # One record: a C int index followed by a C float value (struct 'if').
    # Packing to binary keeps the per-entry memory footprint small.
    _ENTRY = struct.Struct('if')
    # NumPy dtype used to decode a whole run of records at once; it matches
    # _ENTRY on platforms where a C int is 4 bytes.
    _ENTRY_DTYPE = 'i4,f4'
    # Index types treated as "a concrete row/column number" by __getitem__.
    _INT_TYPES = (int, np.integer)

    def __init__(self, container=None, dft=0.0):
        """Create a matrix on top of ``container``.

        container -- mapping used for storage; a fresh ``dict`` is created
                     when omitted (never shared between instances).
        dft       -- value returned for cells/rows/columns that are absent.
        """
        self._data = {} if container is None else container
        self._dft = dft
        # Number of writes performed (overwrites of a cell count again).
        self._nums = 0

    @staticmethod
    def _split_index(index):
        """Unpack ``index`` into ``(i, j)`` or raise IndexError."""
        try:
            i, j = index
        except (TypeError, ValueError):
            raise IndexError('invalid index')
        return i, j

    def _append(self, key, chunk):
        """Append the binary ``chunk`` to the record run stored at ``key``."""
        try:
            self._data[key] += chunk
        except KeyError:
            # First record for this row/column.
            self._data[key] = chunk

    def _load(self, key):
        """Decode the records at ``key`` into ``{index: value}``.

        Returns None when ``key`` is not present in the container.
        """
        try:
            raw = self._data[key]
        except KeyError:
            return None
        records = np.frombuffer(raw, dtype=self._ENTRY_DTYPE)
        # tolist() yields (index, value) tuples; later duplicates override
        # earlier ones when building the dict.
        return dict(records.tolist())

    def __setitem__(self, index, value):
        """Store ``value`` at ``index == (i, j)``.

        Raises IndexError when ``index`` is not a pair.
        """
        i, j = self._split_index(index)
        # Build both records before touching the container so a packing
        # error cannot leave the row written without its column twin.
        row_record = self._ENTRY.pack(j, value)
        col_record = self._ENTRY.pack(i, value)
        self._append('i%d' % i, row_record)
        self._append('j%d' % j, col_record)
        self._nums += 1

    def __getitem__(self, index):
        """Look up a cell, a row or a column.

        ``m[i, j]``      -> the stored value, or the default when absent.
        ``m[i, other]``  -> ``{column: value}`` for row ``i`` (``other`` is any
                            non-integer, e.g. ``None``).
        ``m[other, j]``  -> ``{row: value}`` for column ``j``.

        A row or column that has no records yields the default value rather
        than an empty dict.  Raises IndexError when ``index`` is not a pair
        or when neither component is an integer.
        """
        i, j = self._split_index(index)
        wants_row = isinstance(i, self._INT_TYPES)
        wants_col = isinstance(j, self._INT_TYPES)

        if wants_row:
            row = self._load('i%d' % i)
            if row is None:
                return self._dft
            if wants_col:
                return row.get(j, self._dft)
            return row

        if wants_col:
            column = self._load('j%d' % j)
            if column is None:
                return self._dft
            return column

        raise IndexError('invalid index')

    def __len__(self):
        """Return the number of writes recorded so far."""
        return self._nums

    def __iter__(self):
        """Iteration is not implemented; fail explicitly with TypeError."""
        raise TypeError('DictMatrix does not support iteration')

    def from_file(self, fp, sep='\t', batch_size=10000000):
        """Load ``i<sep>j<sep>value`` lines from ``fp`` into the matrix.

        Writing to a dbm-style container is much slower than working in
        memory, so records are accumulated in per-key in-memory buffers and
        written to the container once every ``batch_size`` lines (and once
        more at the end).  Growable byte buffers are used instead of
        repeated string concatenation.

        fp         -- iterable of text lines.
        sep        -- field separator.
        batch_size -- number of parsed lines buffered between flushes.

        Blank lines are skipped.  Any other line that does not consist of
        exactly three numeric fields raises ValueError.  Returns the total
        number of writes recorded on this matrix.
        """
        pack = self._ENTRY.pack
        cache = {}
        pending = 0
        for line in fp:
            if not line.strip():
                continue
            raw_i, raw_j, raw_v = line.split(sep)
            # Indices may be written as "3" or "3.0"; struct needs true ints.
            i = int(float(raw_i))
            j = int(float(raw_j))
            v = float(raw_v)
            for key, record in (('i%d' % i, pack(j, v)),
                                ('j%d' % j, pack(i, v))):
                buf = cache.get(key)
                if buf is None:
                    buf = cache[key] = bytearray()
                buf.extend(record)
            pending += 1
            self._nums += 1
            if pending >= batch_size:
                self._flush(cache)
                cache = {}
                pending = 0
        self._flush(cache)
        return self._nums

    def _flush(self, cache):
        """Append every buffered record run in ``cache`` to the container.

        ``cache`` maps storage keys to byte buffers; file-like buffers that
        expose ``getvalue()`` are accepted as well.
        """
        for key, buf in cache.items():
            if hasattr(buf, 'getvalue'):
                chunk = buf.getvalue()
            else:
                chunk = bytes(buf)
            self._append(key, chunk)
if __name__ == '__main__':
    # B-tree database opened without a filename, with a 256 MiB cache
    # (268435456 bytes), used as the matrix's backing container.
    db = bsddb.btopen(None, cachesize=268435456)
    data = DictMatrix(db)
    # Close the input file deterministically once loading has finished.
    with open('/path/to/log.txt', 'r') as fp:
        data.from_file(fp, ',')
|