java dataset collector fix
Binary file not shown.
@@ -1,13 +0,0 @@
2020-04-07 17:31:35,222 - 31200 - INFO - commons.py:setEnv - ROOT_DIR : /Users/anilkoyuncu/projects/release/test/fixminer_source/python
2020-04-07 17:31:35,224 - 31200 - INFO - commons.py:setEnv - REPO_PATH : /Users/anilkoyuncu/projects/test/richedit-data/datasets
2020-04-07 17:31:35,224 - 31200 - INFO - commons.py:setEnv - CODE_PATH : /Users/anilkoyuncu/projects/release/test/fixminer_source/python/code/
2020-04-07 17:31:35,224 - 31200 - INFO - commons.py:setEnv - COMMIT_DFS : /Users/anilkoyuncu/projects/test/richedit-data/commitsDF/
2020-04-07 17:31:35,224 - 31200 - INFO - commons.py:setEnv - BUG_POINT : /Users/anilkoyuncu/projects/test/richedit-data/bugPoints/
2020-04-07 17:31:35,224 - 31200 - INFO - commons.py:setEnv - COMMIT_FOLDER : /Users/anilkoyuncu/projects/test/richedit-data/commits/
2020-04-07 17:31:35,225 - 31200 - INFO - commons.py:setEnv - FEATURE_DIR : /Users/anilkoyuncu/projects/test/richedit-data/features/
2020-04-07 17:31:35,225 - 31200 - INFO - commons.py:setEnv - CLASSIFIER_DIR : /Users/anilkoyuncu/projects/test/richedit-data/classifiers/
2020-04-07 17:31:35,225 - 31200 - INFO - commons.py:setEnv - PREDICTION_DIR : /Users/anilkoyuncu/projects/test/richedit-data/predictions/
2020-04-07 17:31:35,225 - 31200 - INFO - commons.py:setEnv - DATASET_DIR : /Users/anilkoyuncu/projects/test/richedit-data/datasets/
2020-04-07 17:31:41,346 - 31200 - INFO - commons.py:shellCallTemplate - JAVA_HOME='/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home' java -jar /Users/anilkoyuncu/projects/release/test/fixminer_source/target/FixPatternMiner-1.0.0-jar-with-dependencies.jar /Users/anilkoyuncu/projects/release/test/fixminer_source/src/main/resources/config.yml RICHEDITSCRIPT
2020-04-07 17:31:41,551 - 31200 - ERROR - commons.py:shellCallTemplate - Error: Could not find or load main class edu.lu.uni.serval.richedit.Launcher
Binary file not shown.
@@ -97,6 +97,7 @@ def setEnv(args):
os.environ["DATA_PATH"] = cfg['fixminer']['datapath']
os.environ["PROJECT_TYPE"] = cfg['fixminer']['projectType']
os.environ["PROJECT_LIST"] = cfg['fixminer']['projectList']
os.environ["REDIS_PORT"] = str(cfg['fixminer']['portDumps'])

# import yaml
#
@@ -177,7 +178,22 @@ def getRun():

# def shellCallTemplate(cmd,enc='utf-8'):
# process = subprocess.Popen(cmd,
# stdout=subprocess.PIPE,stderr=PIPE, shell=True,encoding=enc,
# universal_newlines=True)
#
# while True:
# output = process.stdout.readline()
# print(output.strip())
# # Do something else
# return_code = process.poll()
# if return_code is not None:
# print('RETURN CODE', return_code)
# # Process has finished, read rest of the output
# for output in process.stdout.readlines():
# print(output.strip())
# break

def shellCallTemplate(cmd,enc='utf-8'):
try:
@@ -510,9 +526,9 @@ def get_class_weights(y):
return {cls: round(float(majority)/float(count), 2) for cls, count in counter.items()}


def stopDB(dbDir,portInner,dbName):
cmd = "bash " + dbDir + "/" + "stopServer.sh " + " " + portInner;

def stopDB(dbDir,portInner):
# cmd = "bash " + dbDir + "/" + "stopServer.sh " + " " + portInner;
cmd = "redis-cli -p " + portInner + " shutdown save"
o, e = shellGitCheckout(cmd)
logging.info(o)

@@ -1,3 +1,4 @@
#!/bin/bash
source activate redisEnv

#source activate redisEnv
redis-cli -p $1 shutdown save
+123 -55
@@ -1,64 +1,132 @@
def createDS(subject):
# # if job == 'clone':
from commitCollector import *
from common.commons import *
from commitCollector import *
DATA_PATH = os.environ["DATA_PATH"]
COMMIT_DFS = os.environ["COMMIT_DFS"]
# DATASET_PATH = '/Users/anilkoyuncu/projects/datasets'
REPO_PATH = os.environ["REPO_PATH"]
DATASET_PATH = os.environ["REPO_PATH"]
DATASET = os.environ["dataset"]
ROOT = os.environ["ROOT_DIR"]
PROJECT_LIST = os.environ["PROJECT_LIST"]

caseClone(subject)
# # elif job == 'collect':
from commitCollector import *

caseCollect(subject)
# # elif job == 'fix':
from filterBugFixingCommits import caseFix

caseFix(subject)
#
# # elif job =='brDownload':
from bugReportDownloader import caseBRDownload

caseBRDownload(subject)
# # elif job =='brParser':
from bugReportParser import step1

step1(subject)

# elif job =='dataset':

if not isfile(join(DATA_PATH, 'singleBR.pickle')):

brs = load_zipped_pickle(join(DATA_PATH, subject + "bugReportsComplete.pickle"))

subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
from otherDatasets import markBugFixingPatches

def getCommit(x):
bid, project = x
def createDS():

subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
repo = subjects.query("Subject == '{0}'".format(project)).Repo.tolist()[0]
commits = load_zipped_pickle(join(DATA_PATH, COMMIT_DFS, repo + '.pickle'))
correspondingCommit = commits.query("fix =='{0}'".format(bid)).commit.tolist()
if len(correspondingCommit) == 1:
return [bid, correspondingCommit[0], project]
else:
return None
print('error')
pjList = PROJECT_LIST.split(',')
if not os.path.exists(DATASET_PATH):
os.mkdir(DATASET_PATH)

subjects = pd.read_csv(join(ROOT,'data', 'dataset.csv'))

wl = brs[['bid', 'project']].values.tolist()
dataL = parallelRunMerge(getCommit, wl)

commits = pd.DataFrame(
columns=['bid', 'commit', 'project'],
data=list(filter(None.__ne__, dataL)))

save_zipped_pickle(commits, join(DATA_PATH, 'singleBR.pickle'))
if pjList == ['ALL']:
tuples = subjects[['Repo','GitRepo','Branch']].values.tolist()
else:
commits = load_zipped_pickle(join(DATA_PATH, 'singleBR.pickle'))
subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
logging.info('done matching commits')
commits['repo'] = commits.project.apply(lambda x: subjects.query("Subject == '{0}'".format(x)).Repo.tolist()[0])
# repos = subjects.query("Subject == '{0}'".format(subject)).Repo.tolist()
tuples = subjects[subjects.Repo.isin(pjList)][['Repo', 'GitRepo','Branch']].values.tolist()

workList = commits[['commit', 'repo']].values.tolist()
from dataset import prepareFiles
for t in tuples:
repo,src,branch = t
logging.info(repo)
if isfile(join(COMMIT_DFS,repo+'Fix.pickle')):
commits = load_zipped_pickle(join(COMMIT_DFS,repo+'Fix.pickle'))
else:
cmd = 'git config --global http.postBuffer 157286400'
shellCallTemplate(cmd)
cmd = 'git -C ' + DATASET_PATH + ' clone ' + src
shellCallTemplate(cmd)
logging.info(repo)
getCommitFromRepo(join(REPO_PATH, repo), join(COMMIT_DFS, repo),branch)
rDF = makeDF(join(COMMIT_DFS,repo + '.commits'))
save_zipped_pickle(rDF, join(COMMIT_DFS, repo + ".pickle"))
# return rDF
commits = rDF
commits = markBugFixingPatches(commits,repo)
commits = commits[commits.files.apply(lambda x: np.any([i == 'M' for i in x.values()]))]
# keep only commits that are changing c files (.c)
commits = commits[commits.files.apply(lambda x: np.all([i.endswith('.java') for i in x.keys()]))]
#not a revert commit
# commits = commits[~commits.log.apply(lambda x: x.startswith('Revert'))]
# commits = commits[commits.files.apply(lambda x: len(x) == 1)]
# commits['cocci'] = commits.log.apply(lambda x: True if re.search('cocci|coccinelle', x) else False)
# coccis = commits[commits.cocci].commit.values.tolist()
fixes = commits[commits.fixes.str.len()!=0].commit.values.tolist()
# links = commits[commits.links.str.len()!=0].commit.values.tolist()

parallelRun(prepareFiles, workList)
# bugs = set(fixes).union(links).union(coccis)
# bugs = set(fixes)#.union(coccis)
commits = commits[commits.commit.isin(fixes)]
print(len(commits))
# for s in a.commit.values.tolist():
from otherDatasets import prepareFiles
parallelRun(prepareFiles,commits[['commit','files']].values.tolist(),repo)

# # if job == 'clone':
# for repo,src in subjects[['Repo','GitRepo']].values.tolist():
# if(pjList != ['ALL']):
# if repo in pjList:
# print(repo)
# cmd = 'git -C ' + DATASET_PATH + ' clone ' + src
# shellCallTemplate(cmd)
# logging.info(repo)

# caseClone(subject)

# caseCollect(subject)
# # elif job == 'fix':
# from filterBugFixingCommits import caseFix
#
# caseFix(subject)
# #
# # # elif job =='brDownload':
# from bugReportDownloader import caseBRDownload
#
# caseBRDownload(subject)
# # # elif job =='brParser':
# from bugReportParser import step1
#
# step1(subject)
#
# # elif job =='dataset':
#
# if not isfile(join(DATA_PATH, 'singleBR.pickle')):
#
# brs = load_zipped_pickle(join(DATA_PATH, subject + "bugReportsComplete.pickle"))
#
# subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
#
#
# def getCommit(x):
# bid, project = x
#
# subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
# repo = subjects.query("Subject == '{0}'".format(project)).Repo.tolist()[0]
# commits = load_zipped_pickle(join(DATA_PATH, COMMIT_DFS, repo + '.pickle'))
# correspondingCommit = commits.query("fix =='{0}'".format(bid)).commit.tolist()
# if len(correspondingCommit) == 1:
# return [bid, correspondingCommit[0], project]
# else:
# return None
# print('error')
#
#
# wl = brs[['bid', 'project']].values.tolist()
# dataL = parallelRunMerge(getCommit, wl)
#
# commits = pd.DataFrame(
# columns=['bid', 'commit', 'project'],
# data=list(filter(None.__ne__, dataL)))
#
# save_zipped_pickle(commits, join(DATA_PATH, 'singleBR.pickle'))
# else:
# commits = load_zipped_pickle(join(DATA_PATH, 'singleBR.pickle'))
# subjects = pd.read_csv(join(DATA_PATH, 'subjects.csv'))
# logging.info('done matching commits')
# commits['repo'] = commits.project.apply(lambda x: subjects.query("Subject == '{0}'".format(x)).Repo.tolist()[0])
#
# workList = commits[['commit', 'repo']].values.tolist()
# from dataset import prepareFiles
#
# parallelRun(prepareFiles, workList)
+28 -24
@@ -15,6 +15,7 @@ if __name__ == '__main__':
setEnv(args)

job = args.job
job = "cluster"
ROOT_DIR = os.environ["ROOT_DIR"]
REPO_PATH = os.environ["REPO_PATH"]
CODE_PATH = os.environ["CODE_PATH"]
@@ -25,11 +26,12 @@ if __name__ == '__main__':
FEATURE_DIR = os.environ["FEATURE_DIR"]
DATASET_DIR = os.environ["DATASET_DIR"]
PROJECT_TYPE = os.environ["PROJECT_TYPE"]
REDIS_PORT = os.environ["REDIS_PORT"]
jdk8 = os.environ["JDK8"]
pd.options.mode.chained_assignment = None

subject = 'ALL'
# subject = 'ALL'
rootType = 'if'

print(job)
@@ -37,7 +39,7 @@ if __name__ == '__main__':

if job == 'dataset4j':
from javaDS import createDS
createDS(subject)
createDS()
# elif job == 'linuxDS':
# from linuxDataset import collectBugFixPatches
# collectBugFixPatches()
@@ -45,6 +47,8 @@ if __name__ == '__main__':
from otherDatasets import core
core()
elif job =='richEditScript':
dbDir = join(DATA_PATH, 'redis')
stopDB(dbDir, REDIS_PORT)
cmd = "JAVA_HOME='" + jdk8 + "' java -jar " + join(Path(ROOT_DIR).parent, 'target','FixPatternMiner-1.0.0-jar-with-dependencies.jar') + " " + args.prop + " RICHEDITSCRIPT "
output = shellCallTemplate(cmd)
logging.info(output)
@@ -64,7 +68,7 @@ if __name__ == '__main__':
from pairs import importShape
importShape()

elif job =='compareShapes':
elif job =='compare':
# cmd = "mvn exec:java -f '/data/fixminer_source/' -Dexec.mainClass='edu.lu.uni.serval.richedit.akka.compare.CompareTrees' -Dexec.args='"+ " shape " + join(DATA_PATH,"redis") +" ALLdumps-gumInput.rdb " + "clusterl0-gumInputALL.rdb /data/richedit-core/python/data/richEditScript'"
cmd = "JAVA_HOME='" + jdk8 + "' java -jar " + join(Path(ROOT_DIR).parent, 'target','FixPatternMiner-1.0.0-jar-with-dependencies.jar') + " " + args.prop + " COMPARE "
output = shellCallTemplate(cmd)
@@ -81,27 +85,27 @@ if __name__ == '__main__':
startDB(dbDir, "6399", PROJECT_TYPE)
cluster(join(DATA_PATH,'shapes'),join(DATA_PATH, 'pairs'),'shapes',rootType)

elif job =='actionSI':
from pairs import actionPairs
actionPairs(rootType)

# elif job =='importActionPairs':
from pairs import importAction
importAction(rootType)

elif job =='compareActions':
# cmd = "JAVA_HOME='"+jdk8+"' java -Xmx8096m -Djava.util.concurrent.ForkJoinPool.common.parallelism=64 -jar "+ join(DATA_PATH,'CompareTrees.jar') + " action " + join(DATA_PATH,"redis") +" ALLdumps-gumInput.rdb " + "clusterl1-gumInputALL.rdb"

cmd = "JAVA_HOME='" + jdk8 + "' java -jar " + join(DATA_PATH, 'FixPatternMiner-1.0.1.jar') + " " + join(DATA_PATH, 'app.properties') + " COMPARE " + 'L2'
output = shellCallTemplate(cmd)
logging.info(output)

elif job == 'clusterActions':
from abstractPatch import cluster

dbDir = join(DATA_PATH, 'redis')
startDB(dbDir, "6399", PROJECT_TYPE)
cluster( join(DATA_PATH, 'actions'),join(DATA_PATH, 'pairsAction'),'actions',rootType)
# elif job =='actionSI':
# from pairs import actionPairs
# actionPairs(rootType)
#
# # elif job =='importActionPairs':
# from pairs import importAction
# importAction(rootType)
#
# elif job =='compareActions':
# # cmd = "JAVA_HOME='"+jdk8+"' java -Xmx8096m -Djava.util.concurrent.ForkJoinPool.common.parallelism=64 -jar "+ join(DATA_PATH,'CompareTrees.jar') + " action " + join(DATA_PATH,"redis") +" ALLdumps-gumInput.rdb " + "clusterl1-gumInputALL.rdb"
#
# cmd = "JAVA_HOME='" + jdk8 + "' java -jar " + join(DATA_PATH, 'FixPatternMiner-1.0.1.jar') + " " + join(DATA_PATH, 'app.properties') + " COMPARE " + 'L2'
# output = shellCallTemplate(cmd)
# logging.info(output)
#
# elif job == 'clusterActions':
# from abstractPatch import cluster
#
# dbDir = join(DATA_PATH, 'redis')
# startDB(dbDir, "6399", PROJECT_TYPE)
# cluster( join(DATA_PATH, 'actions'),join(DATA_PATH, 'pairsAction'),'actions',rootType)

elif job == 'tokenSI':
from pairs import tokenPairs

@@ -120,8 +120,9 @@ def prepareFiles(t,dsName):
nonTest = []
for k,v in files.items():
if v == 'M':
if k.endswith('.c') or k.endswith(u'.h'):
nonTest.append(k)
nonTest.append(k)
# if k.endswith('.c') or k.endswith(u'.h'):
# nonTest.append(k)
# nonTest = [f for f in files.keys() if f.endswith('.c') or f.endswith(u'.h')]

cmd = 'git -C ' + join(DATASET_PATH,dsName) + ' rev-parse --short=6 ' + shaOld
@@ -276,6 +277,8 @@ def core():
if(pjList != ['ALL']):
if repo in pjList:
print(repo)
cmd = 'git config --global http.postBuffer 157286400'
shellCallTemplate(cmd)
cmd = 'git -C ' + DATASET_PATH + ' clone ' + src
shellCallTemplate(cmd)
logging.info(repo)