This commit is contained in:
anil
2020-09-03 10:11:15 +02:00
parent 1b57a9930c
commit 8eccfee179
6 changed files with 100 additions and 31 deletions
+34 -10
View File
@@ -105,6 +105,21 @@ def getMapping(pathMapping,x):
def decode_redis(src):
if isinstance(src, list):
rv = list()
for key in src:
rv.append(decode_redis(key))
return rv
elif isinstance(src, dict):
rv = dict()
for key in src:
rv[key.decode()] = decode_redis(src[key])
return rv
elif isinstance(src, bytes):
return src.decode()
else:
raise Exception("type not handled: " +type(src))
def cluster(clusterPath,pairsPath, level):
@@ -115,11 +130,17 @@ def cluster(clusterPath,pairsPath, level):
roots = [i for i in roots if not i.startswith('.')]
port = REDIS_PORT
redis_db = redis.StrictRedis(host="localhost", port=port, db=1)
filenames= decode_redis(redis_db.hgetall('filenames'))
pairsPath = filenames
roots = list(set([i.split('-')[0] for i in filenames.keys()]))
if level == 'tokens':
redis_db = redis.StrictRedis(host="localhost", port=port, db=3)
else:
redis_db = redis.StrictRedis(host="localhost", port=port, db=2)
keys = redis_db.hkeys("compared")
compared = pd.DataFrame(keys, columns=['pairs_key'])
compared['pairs_key'] = compared['pairs_key'].apply(lambda x: x.decode())
@@ -169,21 +190,24 @@ def clusterCore(clusterPath, level, match, pairsPath, root, s,action ,token=''):
logging.info('Cluster size %d',len(subgraph.nodes()))
cluster.append(subgraph.nodes())
cluster
pathMapping = dict()
if level == 'tokens':
indexFile = join(pairsPath, root, s,action+'.index')
elif level == 'actions':
indexFile = join(pairsPath, root, s + '.index')
# else:
# indexFile =join(pairsPath, root, s,action,token+'.index')
df = pd.read_csv(indexFile, header=None, usecols=[0, 1], index_col=[0])
pathMapping = df.to_dict()
# pathMapping = dict()
# if level == 'tokens':
# indexFile = join(pairsPath, root, s,action+'.index')
# elif level == 'actions':
# indexFile = join(pairsPath, root, s + '.index')
# # else:
# # indexFile =join(pairsPath, root, s,action,token+'.index')
# df = pd.read_csv(indexFile, header=None, usecols=[0, 1], index_col=[0])
# pathMapping = df.to_dict()
workList = []
for idx, clus in enumerate(cluster):
logging.info('exporting cluster %s %s %s %d', root,s,action,idx)
for f in clus:
dumpFile = pathMapping[1][int(f)]
# redis_db = redis.StrictRedis(host="localhost", port=6399, db=1)
# dumpFile = redis_db.hget("filenames",root+'-'+s+'-'+f)
dumpFile = pairsPath[root+'-'+s+'-'+f]
# dumpFile = pathMapping[1][int(f)]
t = dumpFile,root,level,clusterPath,s,action,token,idx
workList.append(t)