143 lines
5.5 KiB
Python
143 lines
5.5 KiB
Python
from common.commons import *
|
|
DATA_PATH = os.environ["DATA_PATH"]
|
|
|
|
def core():
    """Create shape pairs limited to one representative file per known cluster.

    Walks ``DATA_PATH/shapes/<root>/<size>/<cluster>/`` and records one member
    file name for every cluster directory.  The frame returned by
    ``shapePairs()`` is then filtered so that a row is kept when its ``file``
    value is one of those recorded members, or when it does not start with any
    of the listed project prefixes.  The filtered frame is passed to
    ``createPairs`` and ``importShape()`` is run afterwards.
    """
    clusterPath = join(DATA_PATH, 'shapes')
    roots = [i for i in listdir(clusterPath)
             if not (i.startswith('.') or i.endswith('.pickle'))]

    # Maps "<root>/<size>/<cluster>" to one member file name of that cluster.
    pattern = {}
    for root in roots:
        for size in listdir(join(clusterPath, root)):
            for clusterName in listdir(join(clusterPath, root, size)):
                members = listdir(join(clusterPath, root, size, clusterName))
                if not members:
                    # An empty cluster directory has no representative;
                    # indexing members[0] would raise IndexError.
                    continue
                # listdir order is arbitrary, so "first" is simply whichever
                # entry the OS returns first.
                pattern[root + '/' + size + '/' + clusterName] = members[0]

    from pairs import shapePairs
    matches = shapePairs()

    # Build the lookup structures once instead of once per row.
    representatives = set(pattern.values())
    projectPrefixes = ('linux_', 'FFmpeg_', 'curl_', 'nginx_',
                       'openssl_', 'redis_', 'tmux_', 'vlc_')
    keep = matches.file.apply(
        lambda x: x in representatives or not x.startswith(projectPrefixes))
    matches = matches[keep]

    from pairs import createPairs
    createPairs(matches)

    from pairs import importShape
    importShape()
|
|
|
|
def checkWrongMembers():
    """Print every cluster directory whose member files differ in byte size.

    Walks ``DATA_PATH/shapes/<root>/<size>/<cluster>/``.  For each cluster the
    members are grouped by ``os.path.getsize``; when more than one distinct
    size is found, the cluster path and one file name per distinct size are
    printed.
    """
    shapesDir = join(DATA_PATH, 'shapes')
    topLevel = []
    for name in listdir(shapesDir):
        if name.startswith('.') or name.endswith('.pickle'):
            continue
        topLevel.append(name)

    for root in topLevel:
        for size in listdir(join(shapesDir, root)):
            for clusterName in listdir(join(shapesDir, root, size)):
                clusterDir = join(shapesDir, root, size, clusterName)
                # Keyed by byte size: a later file of the same size replaces
                # the earlier one, so one name per distinct size survives.
                nameBySize = {
                    os.path.getsize(join(clusterDir, member)): member
                    for member in listdir(clusterDir)
                }
                if len(nameBySize) > 1:
                    print(clusterDir)
                    print(nameBySize.values())
|
|
|
|
def cluster():
    """Cluster the shape-level pairs of every root, one size at a time.

    First records one member file name per existing cluster directory under
    ``DATA_PATH/shapes/<root>/<size>/<cluster>/``.  Then, for each root, loads
    its pairs with ``loadPairMulti(root, '', 'shapes')``, splits them by the
    ``sizes`` column and passes each slice to ``clusterCore`` at the
    ``'shapes'`` level.
    """
    clusterPath = join(DATA_PATH, 'shapes')
    roots = [i for i in listdir(clusterPath)
             if not (i.startswith('.') or i.endswith('.pickle'))]

    # Maps "<root>/<size>/<cluster>" to one member file name of that cluster.
    pattern = {}
    for root in roots:
        for size in listdir(join(clusterPath, root)):
            # Named clusterName so the loop variable does not shadow this
            # function's own name.
            for clusterName in listdir(join(clusterPath, root, size)):
                members = listdir(join(clusterPath, root, size, clusterName))
                if not members:
                    # An empty cluster directory has no representative;
                    # indexing members[0] would raise IndexError.
                    continue
                pattern[root + '/' + size + '/' + clusterName] = members[0]

    pairsPath = join(DATA_PATH, 'pairs')

    from abstractPatch import loadPairMulti
    for root in roots:
        matches = loadPairMulti(root, '', 'shapes')
        for s in matches['sizes'].unique().tolist():
            match = matches[matches['sizes'] == s]
            clusterCore(pattern, clusterPath, 'shapes', match, pairsPath, root, s, '')
|
|
|
|
def clusterCore(pattern, clusterPath, level, match, pairsPath, root, s, action, token=''):
    """Group matched pairs into connected components and export each group.

    Args:
        pattern: dict mapping ``"<root>/<size>/..."`` cluster keys to one
            member file name of that cluster.
        clusterPath: base directory holding the exported clusters.
        level: ``'actions'``, ``'shapes'`` or anything else; selects where
            the ``.index`` file is read from.
        match: frame whose ``tuples`` column holds the graph edges.
        pairsPath: base directory holding the ``.index`` files.
        root: root name; path component under both base directories.
        s: size name; path component (concatenated with strings, so it is
            expected to be a ``str``).
        action: action name; used in the index path for non-shape levels.
        token: token name; used in the index path for the remaining level.

    Every connected component of the graph built from ``match.tuples`` is
    translated to index-file entries.  A component containing the recorded
    member of an existing cluster under ``<root>/<s>/`` is exported under
    that cluster's name; otherwise it receives the next free integer id,
    starting at the number of entries already in ``clusterPath/root/s``.
    The resulting work items are handed to ``parallelRun(dumpFilesCore, ...)``.
    """
    import networkx

    g = networkx.Graph(match.tuples.values.tolist())

    # connected_component_subgraphs() was removed in networkx 2.4;
    # connected_components() yields the node set of each component directly.
    components = []
    for nodes in networkx.connected_components(g):
        logging.info('Cluster size %d', len(nodes))
        components.append(nodes)

    if level == 'actions':
        indexFile = join(pairsPath, root, s, action + '.index')
    elif level == 'shapes':
        indexFile = join(pairsPath, root, s + '.index')
    else:
        indexFile = join(pairsPath, root, s, action, token + '.index')

    # Header-less CSV: column 0 becomes the index, column 1 the value
    # (presumably the dumped file name -- confirm against the index writer).
    df = pd.read_csv(indexFile, header=None, usecols=[0, 1], index_col=[0])
    indexToFile = df.to_dict()[1]

    targetDir = join(clusterPath, root, s)
    existingClusters = len(listdir(targetDir)) if os.path.exists(targetDir) else 0

    # Anchor the prefix with a trailing '/' so that size "1" does not also
    # pick up keys of sizes "10", "11", ...  Computed once: it does not
    # depend on the component being processed.
    prefix = root + '/' + s + '/'
    potentialClusters = [(key, value) for key, value in pattern.items()
                         if key.startswith(prefix)]

    exportClusters = {}
    for nodes in components:
        members = [indexToFile[int(i)] for i in nodes]
        memberSet = set(members)

        foundExisting = False
        for pc, pcMember in potentialClusters:
            if pcMember in memberSet:
                foundExisting = True
                # Reuse the existing cluster's directory name.
                exportClusters[pc.split('/')[-1]] = members

        if not foundExisting:
            exportClusters[existingClusters] = members
            existingClusters += 1

    workList = [
        (f, root, level, clusterPath, s, action, token, k)
        for k, v in exportClusters.items()
        for f in v
    ]

    from abstractPatch import dumpFilesCore
    parallelRun(dumpFilesCore, workList)
|