Add deep learning models.

This commit is contained in:
Kui LIU
2017-08-02 23:49:27 +02:00
parent cbddfdec34
commit baf6b06eed
26 changed files with 1761 additions and 31 deletions
+12
View File
@@ -16,6 +16,18 @@
</properties>
<dependencies>
<dependency>
<groupId>edu.lu.uni.serval</groupId>
<artifactId>MyCluster</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>edu.lu.uni.serval</groupId>
<artifactId>MyFeatureLearner</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>edu.lu.uni</groupId>
<artifactId>simple-utils</artifactId>
@@ -0,0 +1,24 @@
package edu.lu.uni.serval.FixPattern.info;
import edu.lu.uni.serval.gumtree.regroup.HierarchicalActionSet;
import edu.lu.uni.serval.gumtree.regroup.SimpleTree;
/**
 * A mined fix pattern: the tree of the buggy code paired with the edit
 * scripts of its fix. Instances are immutable.
 */
public class FixPattern {

    // Tree of the buggy code; used to compute similarity with target code.
    private final SimpleTree buggyCodeTree;
    // Edit scripts of the fix; used to generate new patches.
    private final HierarchicalActionSet editScripts;

    public FixPattern(SimpleTree buggyCodeTree, HierarchicalActionSet editScripts) {
        // Removed the redundant super() call; fields are final since the
        // class exposes no mutators.
        this.buggyCodeTree = buggyCodeTree;
        this.editScripts = editScripts;
    }

    public SimpleTree getBuggyCodeTree() {
        return buggyCodeTree;
    }

    public HierarchicalActionSet getEditScripts() {
        return editScripts;
    }
}
@@ -0,0 +1,163 @@
package edu.lu.uni.serval.FixPattern.info;
//import java.io.File;
//import java.io.IOException;
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
import org.eclipse.jdt.core.dom.ASTParser;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//import com.github.gumtreediff.actions.ActionGenerator;
//import com.github.gumtreediff.actions.model.Action;
//import com.github.gumtreediff.gen.jdt.JdtTreeGenerator;
//import com.github.gumtreediff.gen.jdt.cd.CdJdtTreeGenerator;
//import com.github.gumtreediff.matchers.Matcher;
//import com.github.gumtreediff.matchers.Matchers;
import com.github.gumtreediff.tree.ITree;
import com.github.gumtreediff.tree.TreeContext;
import edu.lu.uni.serval.FixPattern.utils.ASTNodeMap;
import edu.lu.uni.serval.gen.jdt.exp.ExpJdtTreeGenerator;
/**
 * Legacy GumTree-based analysis of bug fixes.
 *
 * <p>The old repository-traversal and file-diffing implementation was dead
 * (fully commented-out) code and has been removed; see version control
 * history if it is needed again. Only {@link #parseAction(String)} remains:
 * it rewrites the numeric JDT node-type ids embedded in a GumTree action
 * string into readable node-type names.
 */
@Deprecated
public class GumTreeAnalysis {

    /**
     * Replaces numeric AST node-type ids in a GumTree action string, e.g.
     * "UPD 25@@!a from !a to isTrue(a) at 69", with their names from
     * {@link ASTNodeMap}. Fragments are delimited by "@@"; the token just
     * before each delimiter is the candidate node-type id.
     *
     * @param actStr the raw GumTree action string
     * @return the action string with node-type ids replaced where possible
     */
    private static String parseAction(String actStr) {
        String[] fragments = actStr.split("@@");
        int length = fragments.length;
        // StringBuilder instead of repeated String concatenation in the loop.
        StringBuilder parsed = new StringBuilder();
        for (int i = 0; i < length - 1; i++) {
            String fragment = fragments[i];
            int index = fragment.lastIndexOf(" ") + 1;
            String nodeType = fragment.substring(index);
            try {
                // BUGFIX: keep the raw token when it is not a numeric id or
                // when the id is unknown to the map. The original let
                // NumberFormatException escape and could substitute "null".
                String mapped = ASTNodeMap.map.get(Integer.parseInt(nodeType));
                if (mapped != null) {
                    nodeType = mapped;
                }
            } catch (NumberFormatException e) {
                // non-numeric token: leave it unchanged
            }
            parsed.append(fragment, 0, index).append(nodeType).append("@@");
        }
        parsed.append(fragments[length - 1]);
        return parsed.toString();
    }
}
@@ -0,0 +1,35 @@
package edu.lu.uni.serval.FixPatternMining.App;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 1: prepare data for token embedding of edit scripts.
 *
 * <p>Input data: parsed results of patches with GumTree.
 *
 * @author kui.liu
 */
public class Step1 {

    public static void main(String[] args) {
        // Remove stale outputs of a previous run before regenerating them.
        // NOTE(review): EDITSCRIPT_SIZES_FILE was assigned to both the
        // "edit scripts" and the "edit script sizes" variables in the
        // original — presumably one of them should be a distinct
        // EDITSCRIPTS constant; confirm against Configuration.
        FileHelper.deleteFile(Configuration.EDITSCRIPT_SIZES_FILE);
        FileHelper.deleteFile(Configuration.PATCH_SOURCECODE_FILE);
        FileHelper.deleteFile(Configuration.BUGGY_CODY_TOKENS_FILE);
        FileHelper.deleteFile(Configuration.EDITSCRIPT_SIZES_FILE);

        FileHelper.deleteFile(Configuration.SELECTED_EDITSCRIPTES_FILE);
        FileHelper.deleteFile(Configuration.SELECTED_PATCHES_SOURE_CODE_FILE);
        FileHelper.deleteFile(Configuration.SELECTED_BUGGY_TOKEN_FILE);

        DataPreparation.prepareDataForTokenEmbedding();
    }
}
@@ -0,0 +1,33 @@
package edu.lu.uni.serval.FixPatternMining.App;
import edu.lu.uni.serval.FixPatternMining.TokenEmbedder;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 10: prepare data for evaluation by embedding the tokens of the
 * source-code vectors of the training data and testing data.
 *
 * @author kui.liu
 */
public class Step10 {

    // Toggle between the supervised and the un-supervised pipeline.
    private static final boolean SUPERVISED_LEARNING = true;

    public static void main(String[] args) {
        if (SUPERVISED_LEARNING) {
            // Remove the previous output, then re-embed.
            FileHelper.deleteFile(Configuration.EMBEDDED_ALL_TOKENS2);
            new TokenEmbedder().embedTokensOfSourceCodeForSupervisedTesting();
        } else {
            FileHelper.deleteFile(Configuration.EMBEDDED_ALL_TOKENS1);
            new TokenEmbedder().embedTokensOfSourceCodeForUnsupervisedTesting();
        }
    }
}
@@ -0,0 +1,37 @@
package edu.lu.uni.serval.FixPatternMining.App;
import java.util.Map;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 11: prepare data for evaluation — vectorize data for deep learning.
 *
 * @author kui.liu
 */
public class Step11 {

    // Toggle between the supervised and the un-supervised pipeline.
    private static final boolean SUPERVISED_LEARNING = true;

    public static void main(String[] args) {
        if (SUPERVISED_LEARNING) {
            FileHelper.deleteFile(Configuration.TRAINING_DATA);
            FileHelper.deleteDirectory(Configuration.TESTING_DATA);
            // <ClusterNum, Label for supervised learning>
            Map<Integer, Integer> labelByCluster = DataPreparation.readCommonCLusters();
            DataPreparation.prepareDataForFeatureLearningOfEvaluation2(labelByCluster);
        } else {
            FileHelper.deleteFile(Configuration.VECTORIED_ALL_SOURCE_CODE1);
            // Runs before embedding tokens.
            DataPreparation.prepareDataForFeatureLearningOfEvaluation1();
        }
    }
}
@@ -0,0 +1,40 @@
package edu.lu.uni.serval.FixPatternMining.App;
import java.io.File;
import java.util.List;
import edu.lu.uni.serval.FixPatternMining.FeatureLearner;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 12: evaluation — extract features of testing data and predict their
 * labels.
 *
 * @author kui.liu
 */
public class Step12 {

    // Toggle between the supervised and the un-supervised pipeline.
    private static final boolean SUPERVISED_LEARNING = true;

    public static void main(String[] args) {
        if (SUPERVISED_LEARNING) {
            List<File> testingDataFiles =
                    FileHelper.getAllFilesInCurrentDiectory(Configuration.TESTING_DATA, ".csv");
            boolean first = true;
            for (File testingDataFile : testingDataFiles) {
                FeatureLearner learner = new FeatureLearner();
                if (first) {
                    // TODO: we can test this model with our clustered results.
                    learner.learnFeaturesOfSourceCode2(testingDataFile);
                    first = false;
                } else {
                    learner.learnFeaturesOfSourceCode3(testingDataFile);
                }
            }
        } else {
            new FeatureLearner().learnFeaturesOfSourceCode();
            // Extracted features: Configuration.EXTRACTED_FEATURES_TESTING;
            // similarity is computed with cosine similarity.
        }
    }
}
@@ -0,0 +1,33 @@
package edu.lu.uni.serval.FixPatternMining.App;
import java.io.File;
import java.util.List;
import edu.lu.uni.serval.FixPatternMining.FeatureLearner;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 13: evaluation — extract features of testing data and predict their
 * labels. (Implementation pending.)
 *
 * @author kui.liu
 */
public class Step13 {

    // Toggle between the supervised and the un-supervised pipeline.
    private static final boolean SUPERVISED_LEARNING = true;

    public static void main(String[] args) {
        if (SUPERVISED_LEARNING) {
            // Planned: label -> possibility (90/80/70/60, others ignored) for
            // level-one localization; label = clusterNum, then re-compute the
            // similarity with each element; similarity: patches -> fixing bug.
            List<File> testingDataFiles =
                    FileHelper.getAllFilesInCurrentDiectory(Configuration.TESTING_DATA, ".csv");
            for (File testingDataFile : testingDataFiles) {
                // TODO: the per-file evaluation logic is not implemented yet.
            }
        } else {
            // Planned: Configuration.EXTRACTED_FEATURES_TESTING features,
            // compared with cosine similarity.
        }
    }
}
@@ -0,0 +1,25 @@
package edu.lu.uni.serval.FixPatternMining.App;
import edu.lu.uni.serval.FixPatternMining.TokenEmbedder;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 2: embed tokens of all selected edit scripts.
 *
 * <p>Input data: all tokens of the selected edit scripts.
 *
 * @author kui.liu
 */
public class Step2 {

    public static void main(String[] args) {
        // Remove the output of a previous run before re-embedding.
        FileHelper.deleteFile(Configuration.EMBEDDED_EDIT_SCRIPT_TOKENS);
        new TokenEmbedder().embedTokensOfEditScripts();
    }
}
@@ -0,0 +1,24 @@
package edu.lu.uni.serval.FixPatternMining.App;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 3: prepare data for feature learning of the selected edit scripts —
 * vectorize edit scripts with their embedded tokens.
 *
 * @author kui.liu
 */
public class Step3 {

    public static void main(String[] args) {
        // Remove the output of a previous run before re-vectorizing.
        FileHelper.deleteFile(Configuration.VECTORIED_EDIT_SCRIPTS);
        DataPreparation.prepareDataForFeatureLearning();
    }
}
@@ -0,0 +1,25 @@
package edu.lu.uni.serval.FixPatternMining.App;
import edu.lu.uni.serval.FixPatternMining.FeatureLearner;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 4: learn features of all selected edit scripts with a CNN.
 *
 * <p>Input data: the vectorized edit scripts.
 *
 * @author kui.liu
 */
public class Step4 {

    public static void main(String[] args) {
        // Remove previously extracted features before re-learning.
        FileHelper.deleteDirectory(Configuration.EXTRACTED_FEATURES);
        new FeatureLearner().learnFeatures();
    }
}
@@ -0,0 +1,24 @@
package edu.lu.uni.serval.FixPatternMining.App;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 5: prepare data for clustering of edit scripts.
 *
 * <p>Input data: the features of edit scripts learned by the CNN.
 *
 * @author kui.liu
 */
public class Step5 {

    public static void main(String[] args) {
        // Remove the previous cluster input before regenerating it.
        FileHelper.deleteFile(Configuration.CLUSTER_INPUT);
        DataPreparation.prepareDataForClustering();
    }
}
@@ -0,0 +1,23 @@
package edu.lu.uni.serval.FixPatternMining.App;
import edu.lu.uni.serval.FixPatternMining.Cluster;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 6: cluster edit scripts using their extracted features.
 *
 * @author kui.liu
 */
public class Step6 {

    public static void main(String[] args) {
        // Remove the previous cluster output before re-clustering.
        FileHelper.deleteFile(Configuration.CLUSTER_OUTPUT);
        new Cluster().cluster();
    }
}
@@ -0,0 +1,47 @@
package edu.lu.uni.serval.FixPatternMining.App;
import java.util.List;
import java.util.Map;
import edu.lu.uni.serval.FixPatternMining.ClusterAnalyser;
import edu.lu.uni.serval.FixPatternMining.CommonPatterns;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 7: analyze cluster results to obtain common fix patterns.
 *
 * <p>Outputs the per-cluster patch and token files, the cluster-number to
 * label mapping, and the total number of training data items.
 *
 * @author kui.liu
 */
public class Step7 {

    public static void main(String[] args) {
        // Remove outputs of a previous run.
        FileHelper.deleteDirectory(Configuration.CLUSTERED_PATCHES_FILE);
        FileHelper.deleteDirectory(Configuration.CLUSTERED_TOKENSS_FILE);

        // Analyze cluster results.
        ClusterAnalyser analyser = new ClusterAnalyser();
        analyser.readClusterResutls();
        analyser.clusterPatchSourceCode();
        // These results are later used to compute similarity with target java
        // code to localize bugs.
        analyser.clusterBuggyCodeTokens();
        List<Integer> clusterResults = analyser.getClusterResults();

        // Identify common patterns and map each common cluster to a label.
        CommonPatterns commonPatterns = new CommonPatterns(); // Metrics TODO
        // <Integer, Integer>: <ClusterNum, Label for supervised learning>
        Map<Integer, Integer> commonClustersMappingLabel = commonPatterns.identifyCommonPatterns(clusterResults);
        // FIX: build the report with StringBuilder instead of O(n^2) String
        // concatenation in the loop.
        StringBuilder clusterMappingLabel = new StringBuilder("Label : ClusterNum\n");
        for (Map.Entry<Integer, Integer> entry : commonClustersMappingLabel.entrySet()) {
            clusterMappingLabel.append(entry.getValue()).append(" : ").append(entry.getKey()).append("\n");
        }
        FileHelper.outputToFile(Configuration.CLUSTERNUMBER_LABEL_MAP, clusterMappingLabel.toString(), false);

        int totalNumberOfTrainingData = commonPatterns.getTotalNumberofTrainingData();
        FileHelper.outputToFile(Configuration.NUMBER_OF_TRAINING_DATA, "" + totalNumberOfTrainingData, false);
    }
}
@@ -0,0 +1,33 @@
package edu.lu.uni.serval.FixPatternMining.App;
import java.io.File;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.evaluation.ProjectScanner;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 8: prepare testing data for evaluation — parse java projects to get
 * the token vectors of all statements.
 *
 * @author kui.liu
 */
public class Step8 {

    public static void main(String[] args) {
        String outputLocalizeFile = Configuration.TEST_POSITION_FILE;
        String outputTokensFile = Configuration.TEST_DATA_FILE;
        FileHelper.deleteDirectory(outputLocalizeFile);
        FileHelper.deleteDirectory(outputTokensFile);

        // Cap the number of testing instances at one tenth of the
        // training-data size recorded by Step 7.
        String trainingDataSize = FileHelper.readFile(Configuration.NUMBER_OF_TRAINING_DATA).trim();
        int limitationOfTestingInstances = Integer.parseInt(trainingDataSize) / 10;

        File[] projects = new File(Configuration.TEST_INPUT).listFiles();
        new ProjectScanner().scanJavaProject(projects, outputLocalizeFile, outputTokensFile, limitationOfTestingInstances);
    }
}
@@ -0,0 +1,33 @@
package edu.lu.uni.serval.FixPatternMining.App;
import java.util.Map;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Step 9: prepare data for evaluation — merge the token vectors of the
 * source code of the training data and testing data.
 *
 * @author kui.liu
 */
public class Step9 {

    // Toggle between the supervised and the un-supervised pipeline.
    private static final boolean SUPERVISED_LEARNING = true;

    public static void main(String[] args) {
        if (SUPERVISED_LEARNING) {
            // <ClusterNum, Label for supervised learning>
            Map<Integer, Integer> labelByCluster = DataPreparation.readCommonCLusters();
            FileHelper.deleteFile(Configuration.EMBEDDING_DATA_TOKENS2);
            DataPreparation.prepareTokensForEvaluation2(labelByCluster);
        } else {
            DataPreparation.prepareTokensForEvaluation1();
        }
    }
}
@@ -0,0 +1,45 @@
package edu.lu.uni.serval.FixPatternMining;
import edu.lu.uni.serval.Clusters.XMeansCluster;
import edu.lu.uni.serval.config.Configuration;
import weka.core.EuclideanDistance;
/**
 * Clusters features with the X-means clustering algorithm.
 *
 * @author kui.liu
 */
public class Cluster {

    /**
     * Runs X-means over the ARFF feature file (Configuration.CLUSTER_INPUT)
     * and writes the cluster assignments to Configuration.CLUSTER_OUTPUT.
     */
    public void cluster() {
        XMeansCluster xMeans = new XMeansCluster();
        try {
            // The following five parameters have default values.
            xMeans.setDistanceF(new EuclideanDistance());
            xMeans.setUseKDTree(true);
            xMeans.setMaxNumberOfIterations(1000);
            // These two parameters are recommended to be identical.
            xMeans.setMaxKMeans(200);
            xMeans.setMaxKMeansForChildren(200);
            // The following three parameters should be chosen by developers.
            xMeans.setSeed(200);
            xMeans.setMaxNumClusters(100);
            xMeans.setMinNumClusters(1);
            // Run the X-means clustering.
            xMeans.cluster(Configuration.CLUSTER_INPUT, Configuration.CLUSTER_OUTPUT);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
@@ -0,0 +1,146 @@
package edu.lu.uni.serval.FixPatternMining;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Splits the selected patches and buggy-code tokens into per-cluster files,
 * according to the cluster number assigned to each data item by the
 * clustering step.
 */
public class ClusterAnalyser {

    // Cluster assignment per data item: element i is the cluster number of item i.
    private List<Integer> clusterResults;

    /** Loads the cluster assignments produced by the clustering step. */
    public void readClusterResutls() {
        clusterResults = DataPreparation.readClusterResults();
    }

    /**
     * Groups buggy-code token lines by cluster number and appends each group
     * to "Tokens_&lt;clusterNum&gt;.list". The results are later used to compute
     * similarity with target java code to localize bugs.
     */
    public void clusterBuggyCodeTokens() {
        String selectedTokens = Configuration.SELECTED_BUGGY_TOKEN_FILE;
        String clusteredTokens = Configuration.CLUSTERED_TOKENSS_FILE;
        Map<Integer, StringBuilder> builderMap = new HashMap<>();
        Map<Integer, Integer> countersMap = new HashMap<>();
        // BUGFIX: try-with-resources. The original closed scanner/fis in a
        // finally block without null checks, so a missing input file caused
        // an NPE inside finally instead of a clean error report.
        try (FileInputStream fis = new FileInputStream(selectedTokens);
                Scanner scanner = new Scanner(fis)) {
            int index = 0;
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                int clusterNum = clusterResults.get(index);
                StringBuilder builder = getBuilder(builderMap, clusterNum);
                builder.append(line).append("\n");
                // Flush every 1000 items per cluster to bound memory usage.
                if (getCounter(countersMap, clusterNum) % 1000 == 0) {
                    FileHelper.outputToFile(clusteredTokens + "Tokens_" + clusterNum + ".list", builder, true);
                    builder.setLength(0);
                }
                index++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        flushBuilders(builderMap, clusteredTokens, "Tokens_");
    }

    /**
     * Groups patch source code (entries delimited by the "PATCH###" marker)
     * by cluster number and appends each group to
     * "PatchesCluster_&lt;clusterNum&gt;.list".
     */
    public void clusterPatchSourceCode() {
        String selectedPatches = Configuration.SELECTED_PATCHES_SOURE_CODE_FILE;
        String clusteredPatches = Configuration.CLUSTERED_PATCHES_FILE;
        Map<Integer, StringBuilder> builderMap = new HashMap<>();
        Map<Integer, Integer> countersMap = new HashMap<>();
        try (FileInputStream fis = new FileInputStream(selectedPatches);
                Scanner scanner = new Scanner(fis)) {
            String singlePatch = "";
            int index = -1; // becomes 0 at the first "PATCH###" marker
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                if ("".equals(line)) {
                    continue;
                }
                if ("PATCH###".equals(line)) {
                    if (!"".equals(singlePatch)) {
                        appendPatchToCluster(builderMap, countersMap, clusteredPatches, index, singlePatch);
                    }
                    singlePatch = "";
                    index++;
                }
                singlePatch += line + "\n";
            }
            // BUGFIX: the original silently dropped the last patch of the
            // file, because a patch was only emitted when the *next*
            // "PATCH###" marker appeared.
            if (!"".equals(singlePatch) && index >= 0) {
                appendPatchToCluster(builderMap, countersMap, clusteredPatches, index, singlePatch);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        flushBuilders(builderMap, clusteredPatches, "PatchesCluster_");
    }

    // Adds one patch to its cluster's buffer, flushing every 1000 patches per cluster.
    private void appendPatchToCluster(Map<Integer, StringBuilder> builderMap, Map<Integer, Integer> countersMap,
            String clusteredPatches, int index, String singlePatch) {
        int clusterNum = clusterResults.get(index);
        StringBuilder builder = getBuilder(builderMap, clusterNum);
        builder.append(singlePatch);
        if (getCounter(countersMap, clusterNum) % 1000 == 0) {
            FileHelper.outputToFile(clusteredPatches + "PatchesCluster_" + clusterNum + ".list", builder, true);
            builder.setLength(0);
        }
    }

    // Appends any remaining buffered content of every cluster and clears the buffers.
    private void flushBuilders(Map<Integer, StringBuilder> builderMap, String outputPath, String filePrefix) {
        for (Map.Entry<Integer, StringBuilder> entry : builderMap.entrySet()) {
            FileHelper.outputToFile(outputPath + filePrefix + entry.getKey() + ".list", entry.getValue(), true);
            entry.getValue().setLength(0);
        }
    }

    // Increments and returns the number of items seen so far for this cluster.
    private int getCounter(Map<Integer, Integer> countersMap, int clusterNum) {
        int counter = 1;
        if (countersMap.containsKey(clusterNum)) {
            counter += countersMap.get(clusterNum);
        }
        countersMap.put(clusterNum, counter);
        return counter;
    }

    // Returns the buffer of this cluster, creating it on first use.
    private StringBuilder getBuilder(Map<Integer, StringBuilder> builderMap, int clusterNum) {
        if (builderMap.containsKey(clusterNum)) {
            return builderMap.get(clusterNum);
        }
        StringBuilder builder = new StringBuilder();
        builderMap.put(clusterNum, builder);
        return builder;
    }

    public List<Integer> getClusterResults() {
        return clusterResults;
    }
}
@@ -0,0 +1,35 @@
package edu.lu.uni.serval.FixPatternMining;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import edu.lu.uni.serval.utils.FileHelper;
public class ClusterResults {

    /**
     * Read the cluster results from the file of cluster results: one integer
     * cluster number per line.
     *
     * @param clusterResultsFile the file of cluster results
     * @return one cluster number per input line, in file order
     * @throws IOException if the content cannot be read
     */
    public static List<Integer> readClusterResults(File clusterResultsFile) throws IOException {
        List<Integer> clusterResultsList = new ArrayList<>();
        String clusterResults = FileHelper.readFile(clusterResultsFile);
        // BUGFIX: try-with-resources — the original leaked the reader
        // whenever Integer.parseInt threw on a malformed line, because
        // close() was not in a finally block.
        try (BufferedReader reader = new BufferedReader(new StringReader(clusterResults))) {
            String line;
            while ((line = reader.readLine()) != null) {
                clusterResultsList.add(Integer.parseInt(line));
            }
        }
        return clusterResultsList;
    }
}
@@ -0,0 +1,72 @@
package edu.lu.uni.serval.FixPatternMining;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
import edu.lu.uni.serval.utils.MapSorter;
/**
 * Identifies the "common" clusters (common fix patterns) among the cluster
 * results and maps each one to a supervised-learning label.
 */
public class CommonPatterns {

    // A cluster is considered common when it holds at least this many elements.
    private static final int LEAST_NUMBER = 100;

    // Sum of the sizes of all selected common clusters.
    private int totalNumberofTrainingData = 0;

    /**
     * Selects the common clusters and assigns each a consecutive label.
     *
     * @param clusterResults one cluster number per data item
     * @return map from cluster number to supervised-learning label
     */
    public Map<Integer, Integer> identifyCommonPatterns(List<Integer> clusterResults) {
        Map<Integer, List<Integer>> clusterMap = DataPreparation.readClusterResult(clusterResults);
        // TODO how to select the common patterns, number or ratio?
        List<Integer> commonClusters = getCommonClustersByNumber(clusterMap);
        Map<Integer, Integer> labelByCluster = new HashMap<>(); // <ClusterNum, Label>
        int label = 0;
        for (Integer clusterNum : commonClusters) {
            labelByCluster.put(clusterNum, label);
            label++;
        }
        return labelByCluster;
    }

    // Selection by absolute size: every cluster with >= LEAST_NUMBER elements.
    private List<Integer> getCommonClustersByNumber(Map<Integer, List<Integer>> clusterMap) {
        List<Integer> commonClusters = new ArrayList<>();
        for (Map.Entry<Integer, List<Integer>> entry : clusterMap.entrySet()) {
            int size = entry.getValue().size();
            if (size >= LEAST_NUMBER) { // TODO how to set this threshold?
                commonClusters.add(entry.getKey());
                totalNumberofTrainingData += size;
            }
        }
        return commonClusters;
    }

    // Alternative selection: take the largest clusters (by share of all data)
    // until together they cover at least 80% of the items. Currently unused.
    private List<Integer> getCommonClustersByRatio(Map<Integer, List<Integer>> clusterMap, List<Integer> clusterResults) {
        List<Integer> commonClusters = new ArrayList<>();
        double total = (double) clusterResults.size();
        Map<Integer, Double> ratios = new HashMap<>();
        for (Map.Entry<Integer, List<Integer>> entry : clusterMap.entrySet()) {
            ratios.put(entry.getKey(), (double) entry.getValue().size() / total);
        }
        MapSorter<Integer, Double> sorter = new MapSorter<Integer, Double>();
        ratios = sorter.sortByValueDescending(ratios);
        double coveredRatio = 0.0;
        for (Map.Entry<Integer, Double> entry : ratios.entrySet()) {
            coveredRatio += entry.getValue();
            commonClusters.add(entry.getKey());
            totalNumberofTrainingData += clusterMap.get(entry.getKey()).size();
            if (coveredRatio >= 0.8) { // TODO: how to set the value of this threshold?
                break;
            }
        }
        return commonClusters;
    }

    public int getTotalNumberofTrainingData() {
        return totalNumberofTrainingData;
    }
}
@@ -0,0 +1,560 @@
package edu.lu.uni.serval.FixPatternMining.DataPrepare;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.MaxSizeSelector.MaxSizeType;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.data.DataPreparer;
import edu.lu.uni.serval.utils.FileHelper;
/**
* Prepare data for fix patterns mining and evaluation.
*
* @author kui.liu
*
*/
public class DataPreparation {
/**
 * Prepare data for token embedding in the process of fix patterns mining:
 * merges the per-run parsing results into single files, then drops data
 * items whose edit-script vector size is an outlier (above the selected
 * maximum size).
 */
public static void prepareDataForTokenEmbedding() {
    // Collect all data into one file.
    String editScriptsFilePath = Configuration.EDITSCRIPTS_FILE_PATH;
    String patchesSourceCodeFilePath = Configuration.PATCH_SOURCECODE_FILE_PATH;
    String buggyTokensFilePath = Configuration.BUGGYTREE_FILE_PATH;
    String editScriptSizesFilePath = Configuration.EDITSCRIPT_SIZES_FILE_PATH;
    // NOTE(review): editScriptsFile is assigned EDITSCRIPT_SIZES_FILE, the
    // same constant as editScriptSizesFile below — this looks like a
    // copy-paste slip (a distinct EDITSCRIPTS file constant was likely
    // intended); confirm against Configuration.
    String editScriptsFile = Configuration.EDITSCRIPT_SIZES_FILE;
    String patchesSourceCodeFile = Configuration.PATCH_SOURCECODE_FILE;
    String buggyTokensFile = Configuration.BUGGY_CODY_TOKENS_FILE;
    String editScriptSizesFile = Configuration.EDITSCRIPT_SIZES_FILE;
    File file = new File(editScriptsFilePath);
    File[] subFiles = file.listFiles();
    // Merge results of parsed patches; companion files share the "_<id>"
    // suffix of each edit-scripts file.
    for (File subFile : subFiles) {
        String fileName = subFile.getName(); // edit-scripts file name
        String id = fileName.substring(fileName.lastIndexOf("_"));
        FileHelper.outputToFile(editScriptsFile, FileHelper.readFile(subFile), true);
        String patchesSourceCode = patchesSourceCodeFilePath + "patches" + id;
        FileHelper.outputToFile(patchesSourceCodeFile, FileHelper.readFile(patchesSourceCode), true);
        // BUGFIX: the sizes input must be read from the sizes directory
        // (editScriptSizesFilePath) and appended to the merged sizes file
        // (editScriptSizesFile); the original swapped the two, building the
        // input name from the merged file and writing into the path constant.
        String sizes = editScriptSizesFilePath + "sizes" + id;
        FileHelper.outputToFile(editScriptSizesFile, FileHelper.readFile(sizes), true);
        String buggyTokens = buggyTokensFilePath + "tokens" + id;
        FileHelper.outputToFile(buggyTokensFile, FileHelper.readFile(buggyTokens), true);
    }
    // Select data by the size of edit script vectors.
    List<Integer> sizesList;
    try {
        sizesList = MaxSizeSelector.readSizes(editScriptSizesFile);
        int maxSize = MaxSizeSelector.selectMaxSize(MaxSizeType.ThirdQuartile, sizesList);
        // Indexes of data items whose edit-script vector is too large.
        List<Integer> outlierIndexes = new ArrayList<>();
        for (int i = 0, size = sizesList.size(); i < size; i++) {
            if (sizesList.get(i) > maxSize) {
                outlierIndexes.add(i);
            }
        }
        FileHelper.outputToFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_EDIT_SCRIPTS, "" + maxSize, false);
        selectData(editScriptsFile, outlierIndexes, Configuration.SELECTED_EDITSCRIPTES_FILE);
        selectData(patchesSourceCodeFile, outlierIndexes, Configuration.PATCH_SIGNAL, Configuration.SELECTED_PATCHES_SOURE_CODE_FILE);
        int maxTokenVectorSize = selectDataOfSourceCodeTokens(buggyTokensFile, outlierIndexes, Configuration.SELECTED_BUGGY_TOKEN_FILE);
        FileHelper.outputToFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE, "" + maxTokenVectorSize, false);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Copies intputFile to outputFile line by line, skipping lines whose 0-based
 * index appears in outlierIndexList. The buffer is flushed to the output
 * file every 100000 kept lines to bound memory usage.
 *
 * @param intputFile       the merged input file to filter
 * @param outlierIndexList 0-based line indexes to drop (left untouched)
 * @param outputFile       destination file (appended to)
 */
private static void selectData(String intputFile, List<Integer> outlierIndexList, String outputFile) {
    // Work on a copy so the caller's list stays intact (it is reused by the
    // sibling select* calls).
    List<Integer> outlierIndexes = new ArrayList<>();
    outlierIndexes.addAll(outlierIndexList);
    FileInputStream fis = null;
    Scanner scanner = null;
    try {
        fis = new FileInputStream(intputFile);
        scanner = new Scanner(fis);
        int index = 0;
        StringBuilder builder = new StringBuilder();
        int counter = 0;
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            // FIX: a single remove() call instead of contains()+remove()
            // (two linear scans), and Integer.valueOf instead of the
            // deprecated Integer constructor.
            if (!outlierIndexes.remove(Integer.valueOf(index))) {
                builder.append(line).append("\n");
                if (++counter % 100000 == 0) {
                    FileHelper.outputToFile(outputFile, builder, true);
                    builder.setLength(0);
                }
            }
            index++;
        }
        FileHelper.outputToFile(outputFile, builder, true);
        builder.setLength(0);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        try {
            if (scanner != null) {
                scanner.close();
                scanner = null;
            }
            if (fis != null) {
                fis.close();
                fis = null;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Copies multi-line entities from {@code inputFile} to {@code outputFile}, skipping
 * the entities whose zero-based index occurs in {@code outlierIndexes}. An entity
 * starts at a line equal to {@code startingSignal} and runs until the next signal
 * line (or end of file); the signal line itself is part of the entity.
 *
 * @param inputFile      path of the file to filter.
 * @param outlierIndexes zero-based entity indexes to drop (not modified).
 * @param startingSignal exact line content that marks the start of an entity.
 * @param outputFile     path the surviving entities are appended to.
 */
private static void selectData(String inputFile, List<Integer> outlierIndexes, String startingSignal, String outputFile) {
    // Copy so the caller's list is not mutated (consistent with the other overloads;
    // the original drained the caller's list as a side effect).
    List<Integer> remaining = new ArrayList<>(outlierIndexes);
    try (FileInputStream fis = new FileInputStream(inputFile);
            Scanner scanner = new Scanner(fis)) {
        int index = -1; // becomes 0 when the first entity's signal line is seen
        StringBuilder builder = new StringBuilder();
        int counter = 0;
        String singleEntity = "";
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            if (line.equals(startingSignal)) {
                if (!"".equals(singleEntity)) {
                    if (remaining.contains(index)) {
                        remaining.remove(Integer.valueOf(index));
                    } else {
                        builder.append(singleEntity).append("\n");
                        if (++counter % 100000 == 0) { // flush periodically
                            FileHelper.outputToFile(outputFile, builder, true);
                            builder.setLength(0);
                        }
                    }
                    singleEntity = "";
                }
                index++;
            }
            singleEntity += line + "\n";
        }
        // BUG FIX: the final entity has no trailing signal line, so the original
        // loop silently dropped it. Flush it here, honoring the outlier filter.
        if (!"".equals(singleEntity) && !remaining.contains(index)) {
            builder.append(singleEntity).append("\n");
        }
        FileHelper.outputToFile(outputFile, builder, true);
        builder.setLength(0);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Filters {@code inputFile} like {@link #selectData(String, List, String)} and, in
 * addition, tracks the widest token vector among the kept lines (tokens are
 * blank-separated).
 *
 * @param inputFile        path of the token file, one token vector per line.
 * @param outlierIndexList zero-based indexes of the lines to drop (not modified).
 * @param outputFile       path the surviving lines are appended to.
 * @return the maximum number of tokens on any kept line (0 if nothing was kept).
 */
private static int selectDataOfSourceCodeTokens(String inputFile, List<Integer> outlierIndexList, String outputFile) {
    List<Integer> outlierIndexes = new ArrayList<>(outlierIndexList);
    int size = 0; // widest token vector seen so far
    try (FileInputStream fis = new FileInputStream(inputFile);
            Scanner scanner = new Scanner(fis)) {
        int index = 0;
        StringBuilder builder = new StringBuilder();
        int counter = 0;
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            if (outlierIndexes.contains(index)) {
                outlierIndexes.remove(Integer.valueOf(index)); // remove by value
            } else {
                builder.append(line).append("\n");
                if (++counter % 100000 == 0) { // flush periodically to bound memory
                    FileHelper.outputToFile(outputFile, builder, true);
                    builder.setLength(0);
                }
                String[] tokens = line.split(" ");
                if (tokens.length > size) {
                    size = tokens.length;
                }
            }
            index++;
        }
        FileHelper.outputToFile(outputFile, builder, true);
        builder.setLength(0);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return size;
}
/**
 * Prepares the vectorized edit-script input consumed by the feature-learning CNN:
 * each token of every selected edit script is replaced by its embedded vector, and
 * short scripts are right-padded with zero vectors up to the recorded maximum size.
 */
public static void prepareDataForFeatureLearning() {
    // Build the "0, 0, ..., 0" padding vector with a StringBuilder instead of
    // repeated String concatenation (which is quadratic).
    StringBuilder zero = new StringBuilder("0");
    for (int i = 1; i < Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN1; i++) {
        zero.append(", 0");
    }
    String zeroVector = zero.toString();
    int maxSize = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_EDIT_SCRIPTS).trim());
    Map<String, String> embeddedTokens = readEmbeddedTokens(Configuration.EMBEDDED_EDIT_SCRIPT_TOKENS);
    dataPrepare(Configuration.SELECTED_EDITSCRIPTES_FILE, maxSize,
            Configuration.VECTORIED_EDIT_SCRIPTS, embeddedTokens, zeroVector);
}
/**
 * Reads a token-embedding file: each line is "token v1 v2 ... vn". The vector part
 * is stored with ", " separators so it can be written straight into CSV output.
 *
 * @param embeddedTokensFile path of the embedding file.
 * @return map from token to its comma-separated embedded vector.
 */
private static Map<String, String> readEmbeddedTokens(String embeddedTokensFile) {
    Map<String, String> embeddedTokens = new HashMap<>();
    // BUG FIX: the original finally block called scanner.close() unconditionally,
    // throwing NullPointerException whenever the FileInputStream constructor failed.
    // try-with-resources closes both resources safely in reverse order.
    try (FileInputStream fis = new FileInputStream(new File(embeddedTokensFile));
            Scanner scanner = new Scanner(fis)) {
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            int firstBlankIndex = line.indexOf(" ");
            String token = line.substring(0, firstBlankIndex);
            // replace(): the search string is a literal, no regex semantics needed.
            String value = line.substring(firstBlankIndex + 1).replace(" ", ", ");
            embeddedTokens.put(token, value);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return embeddedTokens;
}
/**
 * Vectorizes every line of {@code inputFile}: each token is replaced by its embedded
 * vector and the line is zero-padded to {@code maxSize} tokens; results are appended
 * to {@code outputFile} in batches.
 *
 * @param inputFile      token file, one blank-separated token vector per line.
 * @param maxSize        target token count per line after padding.
 * @param outputFile     destination CSV path (appended to).
 * @param embeddedTokens token-to-vector map from {@link #readEmbeddedTokens(String)}.
 * @param zeroVector     padding vector ("0, 0, ..., 0").
 */
private static void dataPrepare(String inputFile, int maxSize, String outputFile, Map<String, String> embeddedTokens, String zeroVector) {
    StringBuilder builder = new StringBuilder();
    int counter = 0;
    // BUG FIX: the original finally block dereferenced scanner/fis without null
    // checks, throwing NullPointerException when the file could not be opened.
    try (FileInputStream fis = new FileInputStream(new File(inputFile));
            Scanner scanner = new Scanner(fis)) {
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            builder.append(convertToVector(embeddedTokens, line, maxSize, zeroVector));
            if (++counter % 10000 == 0) { // flush periodically to bound memory
                FileHelper.outputToFile(outputFile, builder, true);
                builder.setLength(0);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    FileHelper.outputToFile(outputFile, builder, true);
    builder.setLength(0);
}
/**
 * Converts one line of blank-separated tokens into a comma-separated row of embedded
 * vectors, zero-padded on the right up to {@code maxSize} entries, terminated by a
 * newline.
 *
 * @param embeddedTokens token-to-vector map.
 * @param line           blank-separated tokens.
 * @param maxSize        target entry count of the row.
 * @param zeroVector     padding vector used for missing trailing entries.
 * @return the CSV row, newline-terminated.
 */
private static StringBuilder convertToVector(Map<String, String> embeddedTokens, String line, int maxSize, String zeroVector) {
    String[] tokens = line.split(" ");
    int count = tokens.length;
    StringBuilder row = new StringBuilder();
    if (count == maxSize) {
        // Exactly full: every entry is a token vector; last one ends the row.
        for (int i = 0; i < count; i++) {
            row.append(embeddedTokens.get(tokens[i]));
            row.append(i < count - 1 ? ", " : "\n");
        }
    } else {
        // Shorter line: token vectors first, then zero-vector padding.
        for (String token : tokens) {
            row.append(embeddedTokens.get(token)).append(", ");
        }
        for (int i = count; i < maxSize - 1; i++) {
            row.append(zeroVector).append(", ");
        }
        row.append(zeroVector).append("\n");
    }
    return row;
}
/**
 * Converts the CSV of learned edit-script features into the ARFF file that the
 * clustering tool consumes.
 */
public static void prepareDataForClustering() {
    final String featureFile = Configuration.EXTRACTED_FEATURES + "vectorizedEditScripts.csv";
    DataPreparer.prepareData(featureFile, Configuration.CLUSTER_INPUT);
}
/**
 * Reads the clustering output: one integer cluster label per line, in the same
 * order as the clustered edit scripts.
 *
 * @return the per-script cluster labels (possibly truncated if reading fails).
 */
public static List<Integer> readClusterResults() {
    List<Integer> clusterResults = new ArrayList<>();
    String results = FileHelper.readFile(Configuration.CLUSTER_OUTPUT);
    // try-with-resources replaces the manual finally, which could NPE if the
    // reader had never been assigned.
    try (BufferedReader reader = new BufferedReader(new StringReader(results))) {
        String line;
        while ((line = reader.readLine()) != null) {
            clusterResults.add(Integer.parseInt(line));
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return clusterResults;
}
/**
 * Groups edit-script identifiers by cluster label.
 *
 * @param clusterResults cluster label of each edit script, in script order.
 * @return map from cluster label to the 1-based indexes of its member scripts.
 */
public static Map<Integer, List<Integer>> readClusterResult(List<Integer> clusterResults) {
    Map<Integer, List<Integer>> clusters = new HashMap<>();
    for (int i = 0, size = clusterResults.size(); i < size; i++) {
        // computeIfAbsent replaces the manual containsKey/put dance; members are
        // recorded 1-based to match the external numbering.
        clusters.computeIfAbsent(clusterResults.get(i), k -> new ArrayList<>()).add(i + 1);
    }
    return clusters;
}
/**
 * Data for un-supervised learning: collects the token vectors used for embedding —
 * the selected buggy tokens first (overwriting the output), then every ".list"
 * test-data file (appended).
 */
public static void prepareTokensForEvaluation1() {
    String outputFile = Configuration.EMBEDDING_DATA_TOKENS1;
    // Start fresh with the training tokens, then append each testing-token file.
    FileHelper.outputToFile(outputFile, FileHelper.readFile(Configuration.SELECTED_BUGGY_TOKEN_FILE), false);
    for (File testFile : FileHelper.getAllFilesInCurrentDiectory(Configuration.TEST_DATA_FILE, ".list")) {
        FileHelper.outputToFile(outputFile, FileHelper.readFile(testFile), true);
    }
}
/**
 * Vectorizes the token data used for the unsupervised evaluation: embeds every
 * token and zero-pads each line to the recorded maximum source-code vector size.
 */
public static void prepareDataForFeatureLearningOfEvaluation1() {
    // Zero-padding vector matching the source-code token embedding size,
    // built with a StringBuilder instead of quadratic String concatenation.
    StringBuilder zero = new StringBuilder("0");
    for (int i = 1; i < Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2; i++) {
        zero.append(", 0");
    }
    String zeroVector = zero.toString();
    // trim() added for consistency with the edit-script path: tolerates a trailing newline.
    int maxSize = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE).trim());
    Map<String, String> embeddedTokens = readEmbeddedTokens(Configuration.EMBEDDED_ALL_TOKENS1);
    // NOTE(review): the original iterated the ".list" test-data files in an empty
    // loop (dead code, removed); testing data is still vectorized together with
    // the training data below — separate it once per-file outputs are needed.
    String allTokensOfSourceCode = Configuration.EMBEDDING_DATA_TOKENS1; // TODO testing data should be separated.
    dataPrepare(allTokensOfSourceCode, maxSize, Configuration.VECTORIED_ALL_SOURCE_CODE1, embeddedTokens, zeroVector);
}
/**
 * Data for supervised learning: gathers token vectors for embedding — the clustered
 * tokens of every cluster present in {@code commonClustersMappingLabel}, followed by
 * all ".list" test-data files.
 *
 * @param commonClustersMappingLabel map whose keys are the cluster numbers to keep.
 */
public static void prepareTokensForEvaluation2(Map<Integer, Integer> commonClustersMappingLabel) {
    String outputFile = Configuration.EMBEDDING_DATA_TOKENS2;
    for (File clusterFile : FileHelper.getAllFilesInCurrentDiectory(Configuration.CLUSTERED_TOKENSS_FILE, ".list")) {
        String fileName = clusterFile.getName();
        // The cluster number is encoded in the file name: ..._<num>.list
        int clusterNum = Integer.parseInt(
                fileName.substring(fileName.lastIndexOf("_") + 1, fileName.lastIndexOf(".list")));
        if (commonClustersMappingLabel.containsKey(clusterNum)) {
            FileHelper.outputToFile(outputFile, FileHelper.readFile(clusterFile), true);
        }
    }
    for (File testFile : FileHelper.getAllFilesInCurrentDiectory(Configuration.TEST_DATA_FILE, ".list")) {
        FileHelper.outputToFile(outputFile, FileHelper.readFile(testFile), true);
    }
}
/**
 * Vectorizes the data for the supervised evaluation: training data is one labelled
 * CSV per retained cluster (label = cluster number), testing data is unlabelled
 * (label 0), one output file per input ".list" file.
 *
 * @param commonClustersMappingLabel map whose keys are the cluster numbers to keep.
 */
public static void prepareDataForFeatureLearningOfEvaluation2(Map<Integer, Integer> commonClustersMappingLabel) {
    // Zero-padding vector; StringBuilder avoids quadratic String concatenation.
    StringBuilder zero = new StringBuilder("0");
    for (int i = 1; i < Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2; i++) {
        zero.append(", 0");
    }
    String zeroVector = zero.toString();
    Map<String, String> embeddedTokens = readEmbeddedTokens(Configuration.EMBEDDED_ALL_TOKENS2);
    // trim() added for consistency with the edit-script path: tolerates a trailing newline.
    int maxSize = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE).trim());
    // Training data: one labelled batch per retained cluster.
    for (File clusterFile : FileHelper.getAllFilesInCurrentDiectory(Configuration.CLUSTERED_TOKENSS_FILE, ".list")) {
        String fileName = clusterFile.getName();
        int clusterNum = Integer.parseInt(
                fileName.substring(fileName.lastIndexOf("_") + 1, fileName.lastIndexOf(".list")));
        if (commonClustersMappingLabel.containsKey(clusterNum)) {
            dataPrepare(clusterFile.getPath(), maxSize, Configuration.TRAINING_DATA, embeddedTokens, zeroVector, clusterNum);
        }
    }
    // Testing data: unlabelled (cluster 0).
    for (File testFile : FileHelper.getAllFilesInCurrentDiectory(Configuration.TEST_DATA_FILE, ".list")) {
        // BUG FIX: String.replace returns a new string; the original discarded the
        // result, so output files kept the ".list" suffix instead of ".csv".
        String fileName = testFile.getName().replace(".list", ".csv");
        dataPrepare(testFile.getPath(), maxSize, Configuration.TESTING_DATA + fileName, embeddedTokens, zeroVector, 0);
    }
}
/**
 * Vectorizes every line of {@code inputFile} and appends the label column
 * {@code clusterNum} to each row; results are appended to {@code outputFile} in
 * batches.
 *
 * @param inputFile      token file, one blank-separated token vector per line.
 * @param maxSize        target token count per line after padding.
 * @param outputFile     destination CSV path (appended to).
 * @param embeddedTokens token-to-vector map from {@link #readEmbeddedTokens(String)}.
 * @param zeroVector     padding vector ("0, 0, ..., 0").
 * @param clusterNum     label written as the last column of every row.
 */
private static void dataPrepare(String inputFile, int maxSize, String outputFile, Map<String, String> embeddedTokens,
        String zeroVector, int clusterNum) {
    StringBuilder builder = new StringBuilder();
    int counter = 0;
    // BUG FIX: the original finally block dereferenced scanner/fis without null
    // checks, throwing NullPointerException when the file could not be opened.
    try (FileInputStream fis = new FileInputStream(inputFile);
            Scanner scanner = new Scanner(fis)) {
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            builder.append(convertToVector(embeddedTokens, line, maxSize, zeroVector, clusterNum));
            if (++counter % 10000 == 0) { // flush periodically to bound memory
                FileHelper.outputToFile(outputFile, builder, true);
                builder.setLength(0);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    FileHelper.outputToFile(outputFile, builder, true);
    builder.setLength(0);
}
/**
 * Converts one line of blank-separated tokens into a comma-separated row: one
 * embedded vector per token, zero-vector padding up to {@code maxSize} entries, and
 * the cluster label as the final column.
 *
 * @param embeddedTokens token-to-vector map.
 * @param line           blank-separated tokens.
 * @param maxSize        target entry count before the label column.
 * @param zeroVector     padding vector for missing trailing entries.
 * @param clusterNum     label appended as the last column.
 * @return the CSV row, newline-terminated.
 */
private static StringBuilder convertToVector(Map<String, String> embeddedTokens, String line, int maxSize, String zeroVector, int clusterNum) {
    String[] tokens = line.split(" ");
    StringBuilder row = new StringBuilder();
    // Token vectors first; the padding loop is a no-op when the line is already
    // maxSize tokens wide, so both cases share one code path.
    for (String token : tokens) {
        row.append(embeddedTokens.get(token)).append(", ");
    }
    for (int i = tokens.length; i < maxSize; i++) {
        row.append(zeroVector).append(", ");
    }
    row.append(clusterNum).append("\n");
    return row;
}
/**
 * Reads the cluster-number-to-label mapping file. Each data line has the form
 * "&lt;label&gt; : &lt;clusterNumber&gt;".
 *
 * @return map from cluster number to its label.
 */
public static Map<Integer, Integer> readCommonCLusters() {
    Map<Integer, Integer> commonClustersMappingLabel = new HashMap<>();
    String commonClusters = FileHelper.readFile(Configuration.CLUSTERNUMBER_LABEL_MAP);
    // try-with-resources replaces the manual finally, which could NPE if the
    // reader had never been assigned.
    try (BufferedReader reader = new BufferedReader(new StringReader(commonClusters))) {
        reader.readLine(); // first line intentionally skipped (presumably a header) — TODO confirm
        String line;
        while ((line = reader.readLine()) != null) {
            String[] strArray = line.split(" : ");
            int key = Integer.parseInt(strArray[1]);   // cluster number
            int value = Integer.parseInt(strArray[0]); // label
            commonClustersMappingLabel.put(key, value);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return commonClustersMappingLabel;
}
}
@@ -0,0 +1,66 @@
package edu.lu.uni.serval.FixPatternMining.DataPrepare;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import edu.lu.uni.serval.utils.FileHelper;
import edu.lu.uni.serval.utils.ListSorter;
/**
 * Selects a cut-off ("max") size from a distribution of edit-script sizes, either
 * the third quartile or the Tukey upper whisker.
 */
public class MaxSizeSelector {

    /** Strategy for choosing the cut-off size from a size distribution. */
    public enum MaxSizeType {
        UpperWhisker, ThirdQuartile
    }

    /**
     * Reads one integer size per line from {@code sizeFilePath}.
     *
     * @param sizeFilePath path of the sizes file.
     * @return the sizes, in file order.
     * @throws IOException if the file cannot be read.
     */
    public static List<Integer> readSizes(String sizeFilePath) throws IOException {
        List<Integer> sizes = new ArrayList<>();
        // java.nio Files replaces the FileHelper + StringReader round-trip of the
        // original (assumes the file is UTF-8/ASCII digits — TODO confirm charset).
        for (String line : Files.readAllLines(Paths.get(sizeFilePath))) {
            sizes.add(Integer.parseInt(line.trim()));
        }
        return sizes;
    }

    /**
     * Selects the cut-off size of {@code sizesDistribution} according to
     * {@code maxSizeType}. The input list is not modified.
     *
     * @param maxSizeType       quartile-based selection strategy.
     * @param sizesDistribution the (unsorted) size distribution.
     * @return the selected cut-off size.
     */
    public static int selectMaxSize(MaxSizeType maxSizeType, List<Integer> sizesDistribution) {
        switch (maxSizeType) {
        case UpperWhisker:
            return upperWhisker(sizesDistribution);
        case ThirdQuartile:
            return thirdQuartile(sizesDistribution);
        default:
            return 0; // unreachable: both enum constants are handled above
        }
    }

    /** Tukey upper whisker: Q3 + 1.5 * (Q3 - Q1), computed on a sorted copy. */
    private static int upperWhisker(List<Integer> sizesDistribution) {
        List<Integer> sorted = sortedCopy(sizesDistribution);
        int firstQuartile = sorted.get(sorted.size() * 25 / 100);
        int thirdQuartile = sorted.get(sorted.size() * 75 / 100);
        return thirdQuartile + (int) (1.5 * (thirdQuartile - firstQuartile));
    }

    /** Third quartile: the value at the 75% index of a sorted copy. */
    private static int thirdQuartile(List<Integer> sizesDistribution) {
        List<Integer> sorted = sortedCopy(sizesDistribution);
        return sorted.get(sorted.size() * 75 / 100);
    }

    /** Ascending sorted copy; Collections.sort replaces the project ListSorter. */
    private static List<Integer> sortedCopy(List<Integer> sizesDistribution) {
        List<Integer> sorted = new ArrayList<>(sizesDistribution);
        Collections.sort(sorted);
        return sorted;
    }
}
@@ -0,0 +1,121 @@
package edu.lu.uni.serval.FixPatternMining;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.deeplearner.CNNFeatureExtractor2;
import edu.lu.uni.serval.deeplearner.CNNSupervisedLearning;
import edu.lu.uni.serval.utils.FileHelper;
/**
 * Drives the deep-learning steps: learns feature vectors from vectorized edit
 * scripts (unsupervised CNN) and from vectorized source code (unsupervised and
 * supervised CNNs).
 */
public class FeatureLearner {

    // Hyper-parameters shared by every CNN run in this class (previously duplicated
    // magic numbers in each method).
    private static final int BATCH_SIZE = 1000;          // mini-batch size
    private static final int FEATURE_VECTOR_SIZE = 200;  // size of each learned feature vector
    private static final int NUMBER_OF_EPOCHS = 20;      // training epochs
    private static final int SEED = 123;                 // RNG seed, for reproducibility
    private static final int NUM_OUT_OF_LAYER_1 = 20;    // outputs of the first conv layer
    private static final int NUM_OUT_OF_LAYER_2 = 50;    // outputs of the second conv layer

    /**
     * Learn features of edit scripts for fix patterns mining (unsupervised).
     */
    public void learnFeatures() {
        String editScriptsVectorFile = Configuration.VECTORIED_EDIT_SCRIPTS; // input
        int sizeOfVector = Integer.parseInt(
                FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_EDIT_SCRIPTS).trim());
        int sizeOfTokenVec = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN1;
        try {
            CNNFeatureExtractor2 learner = new CNNFeatureExtractor2(new File(editScriptsVectorFile),
                    sizeOfVector, sizeOfTokenVec, BATCH_SIZE, FEATURE_VECTOR_SIZE);
            learner.setNumberOfEpochs(NUMBER_OF_EPOCHS);
            learner.setSeed(SEED);
            learner.setNumOfOutOfLayer1(NUM_OUT_OF_LAYER_1);
            learner.setNumOfOutOfLayer2(NUM_OUT_OF_LAYER_2);
            learner.setOutputPath(Configuration.EXTRACTED_FEATURES);
            learner.extracteFeaturesWithCNN();
        } catch (IOException | InterruptedException e) {
            // FileNotFoundException is an IOException; one multi-catch replaces the
            // original three identical catch blocks.
            e.printStackTrace();
        }
    }

    /**
     * Learn features of buggy source code (unsupervised).
     */
    public void learnFeaturesOfSourceCode() {
        // trim() added for consistency with learnFeatures(): tolerates a trailing newline.
        int sizeOfVector = Integer.parseInt(
                FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE).trim());
        int sizeOfTokenVec = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2;
        try {
            CNNFeatureExtractor2 learner = new CNNFeatureExtractor2(
                    new File(Configuration.VECTORIED_ALL_SOURCE_CODE1),
                    sizeOfVector, sizeOfTokenVec, BATCH_SIZE, FEATURE_VECTOR_SIZE);
            learner.setNumberOfEpochs(NUMBER_OF_EPOCHS);
            learner.setSeed(SEED);
            learner.setNumOfOutOfLayer1(NUM_OUT_OF_LAYER_1);
            learner.setNumOfOutOfLayer2(NUM_OUT_OF_LAYER_2);
            learner.setOutputPath(Configuration.EXTRACTED_FEATURES_EVALUATION);
            learner.extracteFeaturesWithCNN();
        } catch (IOException | InterruptedException e) {
            e.printStackTrace();
        }
    }

    /**
     * Supervised learning: trains on the labelled training data and evaluates on
     * {@code testingData}.
     *
     * @param testingData CSV file of vectorized testing data.
     */
    public void learnFeaturesOfSourceCode2(File testingData) {
        int sizeOfVector = Integer.parseInt(
                FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE).trim());
        int sizeOfTokenVec = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2;
        try {
            // Number of output classes = number of retained common clusters.
            int clusterNum = DataPreparation.readCommonCLusters().size();
            File trainingData = new File(Configuration.TRAINING_DATA);
            CNNSupervisedLearning learner = new CNNSupervisedLearning(trainingData, sizeOfVector,
                    sizeOfTokenVec, BATCH_SIZE, FEATURE_VECTOR_SIZE, clusterNum, testingData);
            learner.setNumberOfEpochs(NUMBER_OF_EPOCHS);
            learner.setSeed(SEED);
            learner.setNumOfOutOfLayer1(NUM_OUT_OF_LAYER_1);
            learner.setNumOfOutOfLayer2(NUM_OUT_OF_LAYER_2);
            learner.setOutputPath(Configuration.FEATURES_OF_TRAINING_DATA);
            learner.setFeatresOfTestingData(Configuration.FEATURES_OF_TESTING_DATA);
            learner.setPossibilitiesOfPrediction(Configuration.POSSIBILITIES_OF_TESTING_DATA);
            learner.setPredictedResultsOfTestingData(Configuration.PREDICTED_RESULTS_OF_TESTING_DATA);
            learner.setModelFile(Configuration.SUPERVISED_LEARNING_MODEL);
            learner.extracteFeaturesWithCNN();
        } catch (IOException | InterruptedException e) {
            e.printStackTrace();
        }
    }

    /**
     * Supervised learning by loading a previously trained model instead of retraining.
     *
     * @param testingData CSV file of vectorized testing data.
     */
    public void learnFeaturesOfSourceCode3(File testingData) {
        try {
            CNNSupervisedLearning learner = new CNNSupervisedLearning(
                    BATCH_SIZE, testingData, Configuration.SUPERVISED_LEARNING_MODEL);
            learner.setFeatresOfTestingData(Configuration.FEATURES_OF_TESTING_DATA);
            learner.setPossibilitiesOfPrediction(Configuration.POSSIBILITIES_OF_TESTING_DATA);
            learner.setPredictedResultsOfTestingData(Configuration.PREDICTED_RESULTS_OF_TESTING_DATA);
            learner.extracteFeaturesWithCNNByLoadingModel();
        } catch (IOException | InterruptedException e) {
            e.printStackTrace();
        }
    }
}
@@ -0,0 +1,65 @@
package edu.lu.uni.serval.FixPatternMining;
import java.io.File;
import java.io.IOException;
import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.deeplearner.Word2VecEncoder;
/**
 * Encode tokens of edit scripts with Word2Vec.
 *
 * @author kui.liu
 *
 */
public class TokenEmbedder {

    /** Word2Vec context window shared by all embedding runs. */
    private static final int WINDOW_SIZE = 2;
    /** Minimum word frequency of 1: every token is kept, however rare. */
    private static final int MIN_WORD_FREQUENCY = 1;

    /**
     * Embed tokens for fix patterns mining.
     */
    public void embedTokensOfEditScripts() {
        embed(new File(Configuration.SELECTED_EDITSCRIPTES_FILE),
                Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN1,
                Configuration.EMBEDDED_EDIT_SCRIPT_TOKENS);
    }

    /** Embed source-code tokens used for the supervised evaluation. */
    public void embedTokensOfSourceCodeForSupervisedTesting() {
        embed(new File(Configuration.EMBEDDING_DATA_TOKENS2),
                Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2,
                Configuration.EMBEDDED_ALL_TOKENS2);
    }

    /** Embed source-code tokens used for the unsupervised evaluation. */
    public void embedTokensOfSourceCodeForUnsupervisedTesting() {
        embed(new File(Configuration.EMBEDDING_DATA_TOKENS1),
                Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2,
                Configuration.EMBEDDED_ALL_TOKENS1);
    }

    /**
     * Runs Word2Vec over {@code inputFile} and writes the embedded tokens to
     * {@code outputFileName}. The three public methods above differed only in
     * their input/output paths and layer size, so the shared body lives here.
     */
    private void embed(File inputFile, int layerSize, String outputFileName) {
        Word2VecEncoder encoder = new Word2VecEncoder();
        encoder.setWindowSize(WINDOW_SIZE);
        try {
            encoder.embedTokens(inputFile, MIN_WORD_FREQUENCY, layerSize, outputFileName);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
@@ -24,13 +24,13 @@ public class Configuration {
public static final String BUGGY_CODY_TOKENS_FILE = GUM_TREE_OUTPUT + "tokens.list";
public static final String EDITSCRIPT_SIZES_FILE = GUM_TREE_OUTPUT + "editScriptSizes.list";
public static int MAX_EDIT_SCRIPT_VECTOR_SIZE = 0; // The max size of edit script vectors.
public static int MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE = 0; // The max size of all buggy source code token vectors.
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN1 = 100; // tokens of edit scripts.
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN2 = 200; // tokens of source code
// the input path of fix patterns mining.
private static final String MINING_INPUT = ROOT_PATH + "MiningInput/";
public static final String MAX_TOKEN_VECTORS_SIZE_OF_EDIT_SCRIPTS = MINING_INPUT + "/MaxTokenVectorSizeOfEditScripts.list"; // The max size of edit scripts: upper limitation of max size.
public static final String MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE = MINING_INPUT + "/MaxTokenVectorSizeOfBuggySourceCode.list"; // The max size of all buggy source code token vectors.
// the input path of token embedding.
public static final String EMBEDDING_INPUT = MINING_INPUT + "Embedding/";
public static final String SELECTED_PATCHES_SOURE_CODE_FILE = EMBEDDING_INPUT + "patchSourceCode.list";// Selected patches.
@@ -42,7 +42,7 @@ public class Configuration {
public static final String EMBEDDED_EDIT_SCRIPT_TOKENS = FEATURE_LEARNING_INPUT + "embeddedEditScriptTokens.list"; // All embedded tokens of selected edit scripts.
public static final String VECTORIED_EDIT_SCRIPTS = FEATURE_LEARNING_INPUT + "vectorizedEditScripts.csv"; // Embedded and vectorized edit script vectors.
// the input path of clustering.
public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/"; // Extracted features of edit scripts.
public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/"; // Extracted features of all edit scripts.
public static final String CLUSTER_INPUT = MINING_INPUT + "ClusteringInput/input.arff";
// the output path of fix patterns mining.
@@ -53,20 +53,28 @@ public class Configuration {
// evaluation data
public static final String TEST_INPUT = ROOT_PATH + "TestProjects/";
public static final String TEST_LOCALIZATION_FILE = ROOT_PATH + "TestData/Localization.list"; // Positions of all test statements.
public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements.list"; // Token vectors of all test statements.
public static final String TEST_POSITION_FILE = ROOT_PATH + "TestData/Positions/"; // Positions of all test statements.
public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements/"; // Token vectors of all test statements.
public static final String NUMBER_OF_TRAINING_DATA = ROOT_PATH + "TestData/NumberOfTrainingData.list";;
// data of unsupervised learning
public static final String EMBEDDING_DATA_TOKENS1 = ROOT_PATH + "TestData/AllTokenVectorsForEvaluation.list";
public static final String EMBEDDED_ALL_TOKENS1 = ROOT_PATH + "TestData/AllEmbeddedTokens.list";
public static final String VECTORIED_ALL_SOURCE_CODE1 = ROOT_PATH + "TestData/AllVectorizedSourceCode.list";
public static final String EXTRACTED_FEATURES_TESTING = ROOT_PATH + "TestDataExtractedFeatures/";
public static final String VECTORIED_ALL_SOURCE_CODE1 = ROOT_PATH + "TestData/AllVectorizedSourceCode/";
public static final String EXTRACTED_FEATURES_EVALUATION = ROOT_PATH + "TestDataExtractedFeatures/"; // extracted features of all source code (training data and testing data)
// Data of supervised learning
public static final String CLUSTERNUMBER_LABEL_MAP = ROOT_PATH + "TestData/clusterMappingLabel.list";
public static final String EMBEDDING_DATA_TOKENS2 = ROOT_PATH + "TestData/AllTokenVectorsForSupervisedEvaluation.list";
public static final String EMBEDDED_ALL_TOKENS2 = ROOT_PATH + "TestData/AllEmbeddedTokensForSuperVisedEvaluation.list";
public static final String TRAINING_DATA = ROOT_PATH + "TestData/TrainingData.csv"; // Training data of supervised learning
public static final String TESTING_DATA = ROOT_PATH + "TestData/TestingData.csv"; // testing data of supervised learning
public static final String TESTING_DATA = ROOT_PATH + "TestData/SupervisedLearning/"; // testing data of supervised learning
public static final String FEATURES_OF_TRAINING_DATA = ROOT_PATH + "TestingOutput/TraingFeatures/";
public static final String FEATURES_OF_TESTING_DATA = ROOT_PATH + "TestingOutput/TestingFeatures/";
public static final String POSSIBILITIES_OF_TESTING_DATA = ROOT_PATH + "TestingOutput/Posibilities/";
public static final String PREDICTED_RESULTS_OF_TESTING_DATA = ROOT_PATH + "TestingOutput/Prediction/";
public static final String SUPERVISED_LEARNING_MODEL = ROOT_PATH + "TestingOutput/SupervisedLearningModel.zip";
}
@@ -28,23 +28,17 @@ import edu.lu.uni.serval.utils.FileHelper;
*/
public class ProjectScanner {
public static void main(String[] args) {
String inputPath = Configuration.TEST_INPUT; //test java projects
File inputFileDirector = new File(inputPath);
File[] projects = inputFileDirector.listFiles(); // project folders
String outputLocalizeFile = Configuration.TEST_LOCALIZATION_FILE;
String outputTokensFile = Configuration.TEST_DATA_FILE;
private int maxSize = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE));
private int numberOfFiles = 0;
private List<SimpleTree> allSimpleTrees = new ArrayList<>();
public void scanJavaProject(File[] projects, String outputLocalizeFile, String outputTokensFile, int limitation) {
for (File project : projects) {
ProjectScanner scanner = new ProjectScanner();
scanner.scanJavaProject(project, outputLocalizeFile, outputTokensFile);
scanJavaProject(project, outputLocalizeFile, outputTokensFile, limitation);
}
}
List<SimpleTree> allSimpleTrees = new ArrayList<>();
public void scanJavaProject(File javaProject, String outputLocalizeFile, String outputTokensFile) {
public void scanJavaProject(File javaProject, String outputLocalizeFile, String outputTokensFile, int limitation) {
List<File> files = new ArrayList<>();
files.addAll(FileHelper.getAllFiles(javaProject.getPath(), ".java"));
@@ -60,20 +54,25 @@ public class ProjectScanner {
CUCreator cuCreator = new CUCreator();
CompilationUnit cUnit = cuCreator.createCompilationUnit(file);
getTokenVectorOfAllStatements(tree, cUnit, tokensBuilder, localizationsBuilder, javaProject.getPath(), file.getPath());
++ counter;
if (++ counter % 1000 == 0) {
FileHelper.outputToFile(outputLocalizeFile, localizationsBuilder, true);
FileHelper.outputToFile(outputTokensFile, tokensBuilder, true);
if ( counter % limitation == 0) {
numberOfFiles ++;
FileHelper.outputToFile(outputLocalizeFile + "Positions" + numberOfFiles + ".list", localizationsBuilder, true);
FileHelper.outputToFile(outputTokensFile + "Tokens" + numberOfFiles + ".list", tokensBuilder, true);
localizationsBuilder.setLength(0);
tokensBuilder.setLength(0);
}
}
FileHelper.outputToFile(outputLocalizeFile, localizationsBuilder, true);
FileHelper.outputToFile(outputTokensFile, tokensBuilder, true);
if (localizationsBuilder.length() > 0) {
numberOfFiles ++;
FileHelper.outputToFile(outputLocalizeFile + "Positions" + numberOfFiles + ".list", localizationsBuilder, true);
FileHelper.outputToFile(outputTokensFile + "Tokens" + numberOfFiles + ".list", tokensBuilder, true);
localizationsBuilder.setLength(0);
tokensBuilder.setLength(0);
}
}
private void getTokenVectorOfAllStatements(ITree tree, CompilationUnit unit, StringBuilder tokensBuilder, StringBuilder localizationsBuilder, String projectName, String filePath) {
String astNodeType = ASTNodeMap.map.get(tree.getType()); //ignore: SwitchCase, SuperConstructorInvocation, ConstructorInvocation
@@ -98,7 +97,8 @@ public class ProjectScanner {
// project name: file name: line number
String tokens = Tokenizer.getTokensDeepFirst(simpleTree).trim();
String[] tokensArray = tokens.split(" ");
if (tokensArray.length <= Configuration.MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE) {
if (tokensArray.length <= maxSize) {
int position = tree.getPos();
int lineNum = unit.getLineNumber(position);
tokensBuilder.append(tokens).append("\n");
@@ -183,4 +183,5 @@ public class ProjectScanner {
simpleTree.setParent(parent);
return simpleTree;
}
}