From baf6b06eed575b987e02d9a069cedb4cc9de4788 Mon Sep 17 00:00:00 2001
From: Kui LIU
Date: Wed, 2 Aug 2017 23:49:27 +0200
Subject: [PATCH] Add deep learning models.

---
 pom.xml                                        |  12 +
 .../serval/FixPattern/info/FixPattern.java     |  24 +
 .../FixPattern/info/GumTreeAnalysis.java       | 163 +++++
 .../serval/FixPatternMining/App/Step1.java     |  35 ++
 .../serval/FixPatternMining/App/Step10.java    |  33 ++
 .../serval/FixPatternMining/App/Step11.java    |  37 ++
 .../serval/FixPatternMining/App/Step12.java    |  40 ++
 .../serval/FixPatternMining/App/Step13.java    |  33 ++
 .../serval/FixPatternMining/App/Step2.java     |  25 +
 .../serval/FixPatternMining/App/Step3.java     |  24 +
 .../serval/FixPatternMining/App/Step4.java     |  25 +
 .../serval/FixPatternMining/App/Step5.java     |  24 +
 .../serval/FixPatternMining/App/Step6.java     |  23 +
 .../serval/FixPatternMining/App/Step7.java     |  47 ++
 .../serval/FixPatternMining/App/Step8.java     |  33 ++
 .../serval/FixPatternMining/App/Step9.java     |  33 ++
 .../uni/serval/FixPatternMining/Cluster.java   |  45 ++
 .../FixPatternMining/ClusterAnalyser.java      | 146 +++++
 .../FixPatternMining/ClusterResults.java       |  35 ++
 .../FixPatternMining/CommonPatterns.java       |  72 +++
 .../DataPrepare/DataPreparation.java           | 560 ++++++++++++++++++
 .../DataPrepare/MaxSizeSelector.java           |  66 +++
 .../FixPatternMining/FeatureLearner.java       | 121 ++++
 .../FixPatternMining/TokenEmbedder.java        |  65 ++
 .../lu/uni/serval/config/Configuration.java    |  26 +-
 .../uni/serval/evaluation/ProjectScanner.java  |  45 +-
 26 files changed, 1761 insertions(+), 31 deletions(-)
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPattern/info/FixPattern.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPattern/info/GumTreeAnalysis.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step1.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step10.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step11.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step12.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step13.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step2.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step3.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step4.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step5.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step6.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step7.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step8.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step9.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/Cluster.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/ClusterAnalyser.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/ClusterResults.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/CommonPatterns.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/DataPrepare/DataPreparation.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/DataPrepare/MaxSizeSelector.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/FeatureLearner.java
 create mode 100644 src/main/java/edu/lu/uni/serval/FixPatternMining/TokenEmbedder.java

diff --git a/pom.xml b/pom.xml
index 748abab..27101a1
100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,18 @@ + + edu.lu.uni.serval + MyCluster + 0.0.1-SNAPSHOT + + + + edu.lu.uni.serval + MyFeatureLearner + 0.0.1-SNAPSHOT + + edu.lu.uni simple-utils diff --git a/src/main/java/edu/lu/uni/serval/FixPattern/info/FixPattern.java b/src/main/java/edu/lu/uni/serval/FixPattern/info/FixPattern.java new file mode 100644 index 0000000..2e9ae88 --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPattern/info/FixPattern.java @@ -0,0 +1,24 @@ +package edu.lu.uni.serval.FixPattern.info; + +import edu.lu.uni.serval.gumtree.regroup.HierarchicalActionSet; +import edu.lu.uni.serval.gumtree.regroup.SimpleTree; + +public class FixPattern { + private SimpleTree buggyCodeTree; // it will be used to compute the similarity. + private HierarchicalActionSet editScripts; // it will be used to generate new patches. + + public SimpleTree getBuggyCodeTree() { + return buggyCodeTree; + } + + public HierarchicalActionSet getEditScripts() { + return editScripts; + } + + public FixPattern(SimpleTree buggyCodeTree, HierarchicalActionSet editScripts) { + super(); + this.buggyCodeTree = buggyCodeTree; + this.editScripts = editScripts; + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPattern/info/GumTreeAnalysis.java b/src/main/java/edu/lu/uni/serval/FixPattern/info/GumTreeAnalysis.java new file mode 100644 index 0000000..30eaea5 --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPattern/info/GumTreeAnalysis.java @@ -0,0 +1,163 @@ +package edu.lu.uni.serval.FixPattern.info; + +//import java.io.File; +//import java.io.IOException; +//import java.util.ArrayList; +//import java.util.HashMap; +//import java.util.List; +//import java.util.Map; + +import org.eclipse.jdt.core.dom.ASTParser; +//import org.slf4j.Logger; +//import org.slf4j.LoggerFactory; + +//import com.github.gumtreediff.actions.ActionGenerator; +//import com.github.gumtreediff.actions.model.Action; +//import com.github.gumtreediff.gen.jdt.JdtTreeGenerator; +//import com.github.gumtreediff.gen.jdt.cd.CdJdtTreeGenerator; +//import com.github.gumtreediff.matchers.Matcher; +//import com.github.gumtreediff.matchers.Matchers; +import com.github.gumtreediff.tree.ITree; +import com.github.gumtreediff.tree.TreeContext; + +import edu.lu.uni.serval.FixPattern.utils.ASTNodeMap; +import edu.lu.uni.serval.gen.jdt.exp.ExpJdtTreeGenerator; + +@Deprecated +public class GumTreeAnalysis { + + +// private static void analyzeBugFixes(String gitRepoPath, String outputPath) { +// log.info("Repo: " + gitRepoPath); +// +// GitTraveller gitTraveller = new GitTraveller(gitRepoPath, outputPath); +// gitTraveller.travelGitRepo(); +//// Map> commitFiles = gitTraveller.getCommitFiles(); +// +// List allDiffEntries = gitTraveller.getAllDiffEntries(); +// String previousFilesPath = gitTraveller.getPreviousFilesPath(); +// String revisedFilesPath = gitTraveller.getRevisedFilesPath(); +// for (MyDiffEntry diff : allDiffEntries) { +// String fileA = previousFilesPath + diff.getPrevFile(); +// String fileB = revisedFilesPath + diff.getRevFile(); +// List gumTreeResults = GumTreeAnalysis.compareTwoFilesWithGumTree(fileA, fileB); +// if (gumTreeResults.size() == 0) { +// continue; +// } +// StringBuilder builder = new StringBuilder(); +// builder.append("Previous File: " + fileA + "\n"); +// builder.append("Revised File: " + fileB + "\n"); +// String diffs = ""; +// for (ModifiedDetails md : diff.getModifiedDetails()) { +// diffs += md.getLineNumber() + "\n"; +// diffs += md.getFragment() + "\n"; +// } +// builder.append("DiffEntry: " + diffs); 
+// for (String gumTreeResult : gumTreeResults) { +// builder.append(gumTreeResult.toString() + "\n"); +// } +// FileHelper.outputToFile("OUTPUT/GumTreeResults/" + FileHelper.getRepositoryName(gitRepoPath) + "/" + diff.getRevFile().replace(".java", ".txt"), builder, false); +// } +// +//// DiffEntryParser diffEntryParser = new DiffEntryParser(allDiffEntries); +//// diffEntryParser.parseDiffEntries(); +//// +//// // : String ==> revisedFileName. +//// Map> parsedDiffEntries = diffEntryParser.getParsedDiffEntries(); +//// diffEntryParser = null; +//// allDiffEntries = null; +//// +//// gitTraveller = null; +//// +//// for (Map.Entry> entry : parsedDiffEntries.entrySet()) { +//// String revisedFileName = entry.getKey(); +//// String fileA = previousFilesPath + "prev_" + revisedFileName; +//// String fileB = revisedFilesPath + revisedFileName; +//// System.err.println("FileName" + fileA); +//// List gumTreeResults = GumTreeAnalysis.compareTwoFilesWithGumTree(fileA, fileB); +//// StringBuilder builder = new StringBuilder(); +//// builder.append("Previous File: " + fileA + "\n"); +//// builder.append("Revised File: " + fileB + "\n"); +//// builder.append("DiffEntry: "); +//// for (String gumTreeResult : gumTreeResults) { +//// builder.append(gumTreeResult + "\n"); +//// } +//// FileHelper.outputToFile("OUTPUT/GumTreeResults/" + FileHelper.getRepositoryName(gitRepoPath) + "/" + revisedFileName.replace(".java", ".txt"), builder, false); +//// } +// +// } +// +// public static List compareTwoFilesWithGumTree(String prevFile, String revFile) { +// List gumTreeResults = new ArrayList(); +// +// try { +//// TreeContext tc1 = new ExpJdtTreeGenerator().generateFromFile(prevFile); +//// TreeContext tc2 = new ExpJdtTreeGenerator().generateFromFile(revFile); +//// TreeContext tc1 = new JdtTreeGenerator().generateFromFile(prevFile); +//// TreeContext tc2 = new JdtTreeGenerator().generateFromFile(revFile); +// TreeContext tc1 = new RowTokenJdtTreeGenerator().generateFromFile(prevFile); +// TreeContext tc2 = new RowTokenJdtTreeGenerator().generateFromFile(revFile); +//// TreeContext tc1 = new CdJdtTreeGenerator().generateFromFile(prevFile); +//// TreeContext tc2 = new CdJdtTreeGenerator().generateFromFile(revFile); +// ITree t1 = tc1.getRoot(); +// ITree t2 = tc2.getRoot(); +// +// Matcher m = Matchers.getInstance().getMatcher(t1, t2); +// m.match(); +// +// ActionGenerator ag = new ActionGenerator(t1, t2, m.getMappings()); +// ag.generate(); +// +// List actions = ag.getActions(); +// for(Action ac : actions){ +// String actionStr = parseAction(ac.toString()); +// gumTreeResults.add(actionStr); +// } +// +// } catch (IOException e) { +// e.printStackTrace(); +// } +// return gumTreeResults; +// } + +// private static String parseAction(String actStr) { +// // UPD 25@@!a from !a to isTrue(a) at 69 +// String[] actStrArrays = actStr.split("@@"); +// actStr = ""; +// int length = actStrArrays.length; +// for (int i = 0; i < length - 1; i++) { +// String actStrFrag = actStrArrays[i]; +// int index = actStrFrag.lastIndexOf(" ") + 1; +// String nodeType = actStrFrag.substring(index); +// String backup = nodeType; +// try { +// nodeType = ASTNodeMap.map.get(Integer.parseInt(nodeType)); +// } catch (NumberFormatException e) { +// nodeType = backup; +// log.info(actStr); +// } +// actStrFrag = actStrFrag.substring(0, index) + nodeType + "@@"; +// actStr += actStrFrag; +// } +// actStr += actStrArrays[length - 1]; +// return actStr; +// } + + private static String parseAction(String actStr) { + // UPD 25@@!a from !a 
to isTrue(a) at 69 + String[] actStrArrays = actStr.split("@@"); + actStr = ""; + int length = actStrArrays.length; + for (int i =0; i < length - 1; i ++) { + String actStrFrag = actStrArrays[i]; + int index = actStrFrag.lastIndexOf(" ") + 1; + String nodeType = actStrFrag.substring(index); + nodeType = ASTNodeMap.map.get(Integer.parseInt(nodeType)); + actStrFrag = actStrFrag.substring(0, index) + nodeType + "@@"; + actStr += actStrFrag; + } + actStr += actStrArrays[length - 1]; + return actStr; + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step1.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step1.java new file mode 100644 index 0000000..898e555 --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step1.java @@ -0,0 +1,35 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Prepare data for tokens embedding of edit scripts. + * + * Input data: parsed results of patches with GumTree. + * + * @author kui.liu + * + */ +public class Step1 { + public static void main(String[] args) { + String editScriptsFile = Configuration.EDITSCRIPT_SIZES_FILE; + String patchesSourceCodeFile = Configuration.PATCH_SOURCECODE_FILE; + String buggyTokensFile = Configuration.BUGGY_CODY_TOKENS_FILE; + String editScriptSizesFile = Configuration.EDITSCRIPT_SIZES_FILE; + FileHelper.deleteFile(editScriptsFile); + FileHelper.deleteFile(patchesSourceCodeFile); + FileHelper.deleteFile(buggyTokensFile); + FileHelper.deleteFile(editScriptSizesFile); + + String selectedEditScripts = Configuration.SELECTED_EDITSCRIPTES_FILE; + String selectedPatches = Configuration.SELECTED_PATCHES_SOURE_CODE_FILE; + String selectedBuggyTokens = Configuration.SELECTED_BUGGY_TOKEN_FILE; + FileHelper.deleteFile(selectedEditScripts); + FileHelper.deleteFile(selectedPatches); + FileHelper.deleteFile(selectedBuggyTokens); + + DataPreparation.prepareDataForTokenEmbedding(); + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step10.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step10.java new file mode 100644 index 0000000..a6b4132 --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step10.java @@ -0,0 +1,33 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import edu.lu.uni.serval.FixPatternMining.TokenEmbedder; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Prepare data for evaluation. + * + * Embed tokens of source code vectors of training data and testing data. 
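Step1 feeds the selected edit scripts into token embedding, and Step2/Step10 delegate the embedding itself to TokenEmbedder/Word2VecEncoder; the encoder class lives outside this patch. The sketch below shows what such an embedding pass could look like written directly against DeepLearning4J's Word2Vec, assuming a corpus file with one token sequence per line. The window size (2) and minimum word frequency (1) mirror the values set in TokenEmbedder later in this patch; the file names and the 300-dimension layer size are placeholders (the real sizes come from Configuration).

    import java.io.File;
    import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
    import org.deeplearning4j.models.word2vec.Word2Vec;
    import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
    import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
    import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
    import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

    public class TokenEmbeddingSketch {
        public static void main(String[] args) throws Exception {
            // One edit script (or one statement's tokens) per line, tokens separated by spaces.
            SentenceIterator corpus = new BasicLineIterator(new File("selectedEditScripts.list"));
            TokenizerFactory tokenizer = new DefaultTokenizerFactory();

            Word2Vec vec = new Word2Vec.Builder()
                    .minWordFrequency(1)   // keep every token, as in the patch
                    .windowSize(2)         // window size used by TokenEmbedder
                    .layerSize(300)        // placeholder embedding size; the real value comes from Configuration
                    .iterate(corpus)
                    .tokenizerFactory(tokenizer)
                    .build();
            vec.fit();

            // Text output: one line per token, "token v1 v2 ... vn", which is the layout
            // that DataPreparation.readEmbeddedTokens() later parses back into a map.
            WordVectorSerializer.writeWordVectors(vec, "embeddedTokens.txt");
        }
    }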
+ * + * @author kui.liu + * + */ +public class Step10 { + + public static void main(String[] args) { + boolean isSupervisedLearning = true; + if (isSupervisedLearning) {// supervised learning + String outputFileName = Configuration.EMBEDDED_ALL_TOKENS2; + FileHelper.deleteFile(outputFileName); + // Data pre-processing + TokenEmbedder embedder2 = new TokenEmbedder(); + embedder2.embedTokensOfSourceCodeForSupervisedTesting(); + } else { // un-supervised learning + String outputFileName = Configuration.EMBEDDED_ALL_TOKENS1; + FileHelper.deleteFile(outputFileName); + // Data pre-processing + TokenEmbedder embedder2 = new TokenEmbedder(); + embedder2.embedTokensOfSourceCodeForUnsupervisedTesting(); + } + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step11.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step11.java new file mode 100644 index 0000000..229ed78 --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step11.java @@ -0,0 +1,37 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import java.util.Map; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Prepare data for evaluation. + * + * Vectorize data for deep learning. + * + * @author kui.liu + * + */ +public class Step11 { + + public static void main(String[] args) { + boolean isSupervisedLearning = true; + if (isSupervisedLearning) {// supervised learning + String trainingDataPath = Configuration.TRAINING_DATA; + FileHelper.deleteFile(trainingDataPath); + String testingDataPath = Configuration.TESTING_DATA; + FileHelper.deleteDirectory(testingDataPath); + + Map commonClustersMappingLabel = DataPreparation.readCommonCLusters(); + DataPreparation.prepareDataForFeatureLearningOfEvaluation2(commonClustersMappingLabel); + } else { // un-supervised learning + String outputData = Configuration.VECTORIED_ALL_SOURCE_CODE1; + FileHelper.deleteFile(outputData); + // Before embedding tokens. + // List files = FileHelper.getAllFilesInCurrentDiectory(Configuration.TEST_DATA_FILE, ".list"); + DataPreparation.prepareDataForFeatureLearningOfEvaluation1(); + } + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step12.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step12.java new file mode 100644 index 0000000..7abe95e --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step12.java @@ -0,0 +1,40 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import java.io.File; +import java.util.List; + +import edu.lu.uni.serval.FixPatternMining.FeatureLearner; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Evaluation: extract features of testing data and predict their labels. + * + * @author kui.liu + * + */ +public class Step12 { + + public static void main(String[] args) { + boolean isSupervisedLearning = true; + if (isSupervisedLearning) {// supervised learning + List testingDataFiles = FileHelper.getAllFilesInCurrentDiectory(Configuration.TESTING_DATA, ".csv"); + for (int i = 0, size = testingDataFiles.size(); i < size; i ++) { + if (i == 0) { + // TODO: we can test this model by our clustered resutls. 
+ FeatureLearner learner2 = new FeatureLearner(); + learner2.learnFeaturesOfSourceCode2(testingDataFiles.get(i)); + } else { + FeatureLearner learner2 = new FeatureLearner(); + learner2.learnFeaturesOfSourceCode3(testingDataFiles.get(i)); + } + } + } else { // un-supervised learning + + FeatureLearner learner2 = new FeatureLearner(); + learner2.learnFeaturesOfSourceCode(); + // Extracted Features: Configuration.EXTRACTED_FEATURES_TESTING; + // Compute the similarity: cosin similarity + } + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step13.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step13.java new file mode 100644 index 0000000..67e7a29 --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step13.java @@ -0,0 +1,33 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import java.io.File; +import java.util.List; + +import edu.lu.uni.serval.FixPatternMining.FeatureLearner; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Evaluation: extract features of testing data and predict their labels. + * + * @author kui.liu + * + */ +public class Step13 { + + public static void main(String[] args) { + boolean isSupervisedLearning = true; + if (isSupervisedLearning) {// supervised learning + // label --> possibility --> 90, 80, 70, 60 others ignored, level one localization + // label: clusterNum, re-compute similarity with each element. 90, 80, 70, 60. + // similarity: patches --> fixing bug. + List testingDataFiles = FileHelper.getAllFilesInCurrentDiectory(Configuration.TESTING_DATA, ".csv"); + for (int i = 0, size = testingDataFiles.size(); i < size; i ++) { + } + } else { // un-supervised learning + + // Extracted Features: Configuration.EXTRACTED_FEATURES_TESTING; + // Compute the similarity: cosin similarity + } + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step2.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step2.java new file mode 100644 index 0000000..1ec7dcc --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step2.java @@ -0,0 +1,25 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import edu.lu.uni.serval.FixPatternMining.TokenEmbedder; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Embed tokens of all selected edit scripts. + * + * Input data: all tokens of selected edit scripts. + * + * @author kui.liu + * + */ +public class Step2 { + + public static void main(String[] args) { + String outputFileName = Configuration.EMBEDDED_EDIT_SCRIPT_TOKENS; + FileHelper.deleteFile(outputFileName); + + TokenEmbedder embedder = new TokenEmbedder(); + embedder.embedTokensOfEditScripts(); + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step3.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step3.java new file mode 100644 index 0000000..c88630a --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step3.java @@ -0,0 +1,24 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Prepare data for features learning of selected edit scripts. + * + * Vectorize edit scripts with embedded tokens of edit scripts. 
+ * + * @author kui.liu + * + */ +public class Step3 { + + public static void main(String[] args) { + String vectorizedEditScripts = Configuration.VECTORIED_EDIT_SCRIPTS; + FileHelper.deleteFile(vectorizedEditScripts); + + DataPreparation.prepareDataForFeatureLearning(); + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step4.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step4.java new file mode 100644 index 0000000..7b168ce --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step4.java @@ -0,0 +1,25 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import edu.lu.uni.serval.FixPatternMining.FeatureLearner; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Learn features of all selected edit scripts with CNN algorithm. + * + * Input data: vectorized edit scripts. + * + * @author kui.liu + * + */ +public class Step4 { + + public static void main(String[] args) { + String extractedFeatures = Configuration.EXTRACTED_FEATURES; + FileHelper.deleteDirectory(extractedFeatures); + + FeatureLearner learner = new FeatureLearner(); + learner.learnFeatures(); + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step5.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step5.java new file mode 100644 index 0000000..fd94449 --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step5.java @@ -0,0 +1,24 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Prepare data for clustering of edit scripts. + * + * Input data: learned features of edit scripts by CNN. + * + * @author kui.liu + * + */ +public class Step5 { + + public static void main(String[] args) { + String clusterInput = Configuration.CLUSTER_INPUT; + FileHelper.deleteFile(clusterInput); + + DataPreparation.prepareDataForClustering(); + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step6.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step6.java new file mode 100644 index 0000000..707b82b --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step6.java @@ -0,0 +1,23 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import edu.lu.uni.serval.FixPatternMining.Cluster; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Clustering of edit scripts with extracted features of edit scripts. + * + * @author kui.liu + * + */ +public class Step6 { + + public static void main(String[] args) { + String clusterOutput = Configuration.CLUSTER_OUTPUT; + FileHelper.deleteFile(clusterOutput); + + Cluster cluster = new Cluster(); + cluster.cluster(); + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step7.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step7.java new file mode 100644 index 0000000..1292f9b --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step7.java @@ -0,0 +1,47 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import java.util.List; +import java.util.Map; + +import edu.lu.uni.serval.FixPatternMining.ClusterAnalyser; +import edu.lu.uni.serval.FixPatternMining.CommonPatterns; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Analyze cluster results to obtain common fix patterns. 
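Step7 keeps a cluster only if it holds at least LEAST_NUMBER members (100 in CommonPatterns, shown later in this patch) and then gives each surviving cluster a dense label that the supervised steps use as a class. The sketch below illustrates that selection over the per-instance cluster assignments; it is a simplification, not the exact CommonPatterns logic (for example, the label ordering here is arbitrary).

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class CommonClusterSketch {
        static final int LEAST_NUMBER = 100;  // minimum cluster size used in CommonPatterns

        /** Maps cluster number to label (0..k-1) for every cluster with at least LEAST_NUMBER members. */
        static Map<Integer, Integer> selectCommonClusters(List<Integer> assignments) {
            Map<Integer, Integer> sizes = new HashMap<>();
            for (int clusterNum : assignments) {
                sizes.merge(clusterNum, 1, Integer::sum);   // count members per cluster
            }
            Map<Integer, Integer> clusterToLabel = new HashMap<>();
            int label = 0;
            for (Map.Entry<Integer, Integer> e : sizes.entrySet()) {
                if (e.getValue() >= LEAST_NUMBER) {
                    clusterToLabel.put(e.getKey(), label++); // common cluster: assign the next label
                }
            }
            return clusterToLabel;
        }
    }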
+ * + * @author kui.liu + * + */ +public class Step7 { + + public static void main(String[] args) { + String clusteredPatches = Configuration.CLUSTERED_PATCHES_FILE; + String clusteredBuggyTokens = Configuration.CLUSTERED_TOKENSS_FILE; + FileHelper.deleteDirectory(clusteredPatches); + FileHelper.deleteDirectory(clusteredBuggyTokens); + + // analyze cluster results. + ClusterAnalyser analyser = new ClusterAnalyser(); + analyser.readClusterResutls(); + analyser.clusterPatchSourceCode(); + analyser.clusterBuggyCodeTokens(); // the results will be used to compute similarity with target java code to localize bugs. + + List clusterResults = analyser.getClusterResults(); + + // Common patterns. + CommonPatterns commonPatterns = new CommonPatterns(); // Metrics TODO + // : + Map commonClustersMappingLabel = commonPatterns.identifyCommonPatterns(clusterResults); + String clusterMappingLabel = "Label : ClusterNum\n"; + for (Map.Entry entry : commonClustersMappingLabel.entrySet()) { + clusterMappingLabel += entry.getValue() + " : " + entry.getKey() + "\n"; + } + FileHelper.outputToFile(Configuration.CLUSTERNUMBER_LABEL_MAP, clusterMappingLabel, false); + + int totalNumberOfTrainingData = commonPatterns.getTotalNumberofTrainingData(); + FileHelper.outputToFile(Configuration.NUMBER_OF_TRAINING_DATA, "" + totalNumberOfTrainingData, false); + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step8.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step8.java new file mode 100644 index 0000000..e1bd185 --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step8.java @@ -0,0 +1,33 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import java.io.File; + +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.evaluation.ProjectScanner; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Prepare testing data for evaluation. + * + * Parse java projects to get the token vectors of all statements. + * + * @author kui.liu + * + */ +public class Step8 { + + public static void main(String[] args) { + String outputLocalizeFile = Configuration.TEST_POSITION_FILE; + String outputTokensFile = Configuration.TEST_DATA_FILE; + FileHelper.deleteDirectory(outputLocalizeFile); + FileHelper.deleteDirectory(outputTokensFile); + + int limitationOfTestingInstances = Integer.parseInt(FileHelper.readFile(Configuration.NUMBER_OF_TRAINING_DATA).trim()) / 10; + + File testProjects = new File(Configuration.TEST_INPUT); + File[] projects = testProjects.listFiles(); + ProjectScanner scanner = new ProjectScanner(); + scanner.scanJavaProject(projects, outputLocalizeFile, outputTokensFile, limitationOfTestingInstances); + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step9.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step9.java new file mode 100644 index 0000000..474e9ca --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/App/Step9.java @@ -0,0 +1,33 @@ +package edu.lu.uni.serval.FixPatternMining.App; + +import java.util.Map; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Prepare data for evaluation. + * + * Merge token vectors of source code of training data and testing data. 
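Step9's merge amounts to concatenating the token files of the retained (common) clusters with the token files extracted from the projects under test, so that a single Word2Vec vocabulary covers both training and testing code. A small java.nio sketch of that merge is shown below; it assumes the Tokens_<clusterNum>.list naming produced by ClusterAnalyser, and all paths are placeholders.

    import java.io.IOException;
    import java.nio.file.*;
    import java.util.Map;

    public class MergeTokenCorpusSketch {
        /** Appends the token files of common clusters plus all test-project token files into one corpus. */
        static void merge(Path clusteredTokensDir, Path testTokensDir,
                          Map<Integer, Integer> commonClusters, Path mergedCorpus) throws IOException {
            Files.deleteIfExists(mergedCorpus);
            Files.createFile(mergedCorpus);
            try (DirectoryStream<Path> files = Files.newDirectoryStream(clusteredTokensDir, "Tokens_*.list")) {
                for (Path file : files) {
                    String name = file.getFileName().toString();                  // e.g. Tokens_7.list
                    int clusterNum = Integer.parseInt(name.substring(7, name.length() - 5));
                    if (commonClusters.containsKey(clusterNum)) {                 // training side
                        Files.write(mergedCorpus, Files.readAllBytes(file), StandardOpenOption.APPEND);
                    }
                }
            }
            try (DirectoryStream<Path> files = Files.newDirectoryStream(testTokensDir, "*.list")) {
                for (Path file : files) {                                         // testing side
                    Files.write(mergedCorpus, Files.readAllBytes(file), StandardOpenOption.APPEND);
                }
            }
        }
    }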
+ * + * @author kui.liu + * + */ +public class Step9 { + + public static void main(String[] args) { + boolean isSupervisedLearning = true; + if (isSupervisedLearning) {// supervised learning + Map commonClustersMappingLabel = DataPreparation.readCommonCLusters(); + + String outputFile = Configuration.EMBEDDING_DATA_TOKENS2; + FileHelper.deleteFile(outputFile); + // Data merge + DataPreparation.prepareTokensForEvaluation2(commonClustersMappingLabel); + } else { // un-supervised learning + // Data merge + DataPreparation.prepareTokensForEvaluation1(); + } + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/Cluster.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/Cluster.java new file mode 100644 index 0000000..55beafa --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/Cluster.java @@ -0,0 +1,45 @@ +package edu.lu.uni.serval.FixPatternMining; + +import edu.lu.uni.serval.Clusters.XMeansCluster; +import edu.lu.uni.serval.config.Configuration; +import weka.core.EuclideanDistance; + +/** + * Cluster features with X-means clustering algorithm. + * + * @author kui.liu + * + */ +public class Cluster { + + public void cluster() { + String arffFile = Configuration.CLUSTER_INPUT; + String clusterResults = Configuration.CLUSTER_OUTPUT; + + XMeansCluster cluster = new XMeansCluster(); + try { + /* + * The below 5 parameters have default values. + */ + cluster.setDistanceF(new EuclideanDistance()); + cluster.setUseKDTree(true); + cluster.setMaxNumberOfIterations(1000); + // The below 2 parameters are recommended to be the same. + cluster.setMaxKMeans(200); + cluster.setMaxKMeansForChildren(200); + + /* + * The values of the below 3 parameters should be set by developers. + */ + cluster.setSeed(200); + cluster.setMaxNumClusters(100); + cluster.setMinNumClusters(1); + + // X-means clustering is beginning. + cluster.cluster(arffFile, clusterResults); + // X-means clustering is finished. + } catch (Exception e) { + e.printStackTrace(); + } + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/ClusterAnalyser.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/ClusterAnalyser.java new file mode 100644 index 0000000..bd9061b --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/ClusterAnalyser.java @@ -0,0 +1,146 @@ +package edu.lu.uni.serval.FixPatternMining; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Scanner; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.utils.FileHelper; + +public class ClusterAnalyser { + + private List clusterResults; // each element is a cluster number. 
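The XMeansCluster configured above ships in the separate MyCluster artifact referenced from pom.xml, so its internals are not visible in this patch. If that wrapper is backed by Weka's optional XMeans clusterer (an assumption; only the wrapper API is shown here), a bare-bones run over the ARFF produced in Step5 could look like the sketch below, writing one cluster number per instance, which is the format readClusterResults() parses.

    import java.io.PrintWriter;
    import weka.clusterers.XMeans;
    import weka.core.EuclideanDistance;
    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;

    public class XMeansSketch {
        public static void main(String[] args) throws Exception {
            Instances data = DataSource.read("clusterInput.arff");   // placeholder path (Step5 output)

            XMeans xmeans = new XMeans();
            xmeans.setDistanceF(new EuclideanDistance());  // same distance as in Cluster.java
            xmeans.setSeed(200);
            xmeans.setMinNumClusters(1);
            xmeans.setMaxNumClusters(100);
            xmeans.buildClusterer(data);

            // One cluster number per line, in instance order.
            try (PrintWriter out = new PrintWriter("clusterResults.txt")) {
                for (int i = 0; i < data.numInstances(); i++) {
                    out.println(xmeans.clusterInstance(data.instance(i)));
                }
            }
        }
    }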
+ + public void readClusterResutls() { + clusterResults = DataPreparation.readClusterResults(); + } + + public void clusterBuggyCodeTokens() { + String selectedTokens = Configuration.SELECTED_BUGGY_TOKEN_FILE; + String clusteredTokens = Configuration.CLUSTERED_TOKENSS_FILE; + + FileInputStream fis = null; + Scanner scanner = null; + + Map builderMap = new HashMap<>(); + Map countersMap = new HashMap<>(); + try { + fis = new FileInputStream(selectedTokens); + scanner = new Scanner(fis); + int index = 0; + + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + int clusterNum = clusterResults.get(index); + StringBuilder builder = getBuilder(builderMap, clusterNum); + builder.append(line).append("\n"); + int counter = getCounter(countersMap, clusterNum); + if (counter % 1000 == 0) { + FileHelper.outputToFile(clusteredTokens + "Tokens_" + clusterNum + ".list", builder, true); + builder.setLength(0); + builderMap.put(clusterNum, builder); + } + index ++; + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } finally { + try { + scanner.close(); + fis.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + for (Map.Entry entry : builderMap.entrySet()) { + int clusterNum = entry.getKey(); + StringBuilder builder = entry.getValue(); + FileHelper.outputToFile(clusteredTokens + "Tokens_" + clusterNum + ".list", builder, true); + builder.setLength(0); + } + } + + public void clusterPatchSourceCode() { + String selectedPatches = Configuration.SELECTED_PATCHES_SOURE_CODE_FILE; + String clusteredPatches = Configuration.CLUSTERED_PATCHES_FILE; + + FileInputStream fis = null; + Scanner scanner = null; + + Map builderMap = new HashMap<>(); + Map countersMap = new HashMap<>(); + try { + fis = new FileInputStream(selectedPatches); + scanner = new Scanner(fis); + String singlePatch = ""; + int index = -1; + + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + if ("".equals(line)) continue; + if ("PATCH###".equals(line)) { + if (!"".equals(singlePatch)) { + int clusterNum = clusterResults.get(index); + StringBuilder builder = getBuilder(builderMap, clusterNum); + builder.append(singlePatch); + int counter = getCounter(countersMap, clusterNum); + if (counter % 1000 == 0) { + FileHelper.outputToFile(clusteredPatches + "PatchesCluster_" + clusterNum + ".list", builder, true); + builder.setLength(0); + builderMap.put(clusterNum, builder); + } + } + singlePatch = ""; + index ++; + } + singlePatch += line + "\n"; + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } finally { + try { + scanner.close(); + fis.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + for (Map.Entry entry : builderMap.entrySet()) { + int clusterNum = entry.getKey(); + StringBuilder builder = entry.getValue(); + FileHelper.outputToFile(clusteredPatches + "PatchesCluster_" + clusterNum + ".list", builder, true); + builder.setLength(0); + } + } + + private int getCounter(Map countersMap, int clusterNum) { + int counter = 1; + if (countersMap.containsKey(clusterNum)) { + counter += countersMap.get(clusterNum); + } + countersMap.put(clusterNum, counter); + return counter; + } + + private StringBuilder getBuilder(Map builderMap, int clusterNum) { + if (builderMap.containsKey(clusterNum)) { + return builderMap.get(clusterNum); + } else { + StringBuilder builder = new StringBuilder(); + builderMap.put(clusterNum, builder); + return builder; + } + } + + public List getClusterResults() { + return clusterResults; + } + +} diff --git 
a/src/main/java/edu/lu/uni/serval/FixPatternMining/ClusterResults.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/ClusterResults.java new file mode 100644 index 0000000..481c89b --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/ClusterResults.java @@ -0,0 +1,35 @@ +package edu.lu.uni.serval.FixPatternMining; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import edu.lu.uni.serval.utils.FileHelper; + +public class ClusterResults { + + /** + * Read the cluster results from the file of cluster results. + * + * @param clusterResultsFile, the file of cluster results. + * @return List, each integer is a cluster number. + * @throws IOException + */ + public static List readClusterResults(File clusterResultsFile) throws IOException { + List clusterResultsList = new ArrayList<>(); + String clusterResults = FileHelper.readFile(clusterResultsFile); + BufferedReader reader = new BufferedReader(new StringReader(clusterResults)); + + String line = null; + while ((line = reader.readLine()) != null) { + int cluster = Integer.parseInt(line); + clusterResultsList.add(cluster); + } + + reader.close(); + return clusterResultsList; + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/CommonPatterns.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/CommonPatterns.java new file mode 100644 index 0000000..e8db8dc --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/CommonPatterns.java @@ -0,0 +1,72 @@ +package edu.lu.uni.serval.FixPatternMining; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation; +import edu.lu.uni.serval.utils.MapSorter; + +public class CommonPatterns { + + private static final int LEAST_NUMBER = 100; + private int totalNumberofTrainingData = 0; + + public Map identifyCommonPatterns(List clusterResults) { + Map> clusterMap = DataPreparation.readClusterResult(clusterResults); + // TODO how to select the common patterns, number or ratio? + List commonClusterNum = getCommonClustersByNumber(clusterMap); // Integer: clusterNum. + + Map clusterNumMapLabel = new HashMap<>(); // + for (int i = 0, size = commonClusterNum.size(); i < size; i ++) { + clusterNumMapLabel.put(commonClusterNum.get(i), i); + } + + return clusterNumMapLabel; + } + + private List getCommonClustersByNumber(Map> clusterMap) { + List commonClusterNum = new ArrayList<>(); + + for (Map.Entry> entry : clusterMap.entrySet()) { + List elements = entry.getValue(); + int size = elements.size(); + if (size >= LEAST_NUMBER) { // TODO how to set this threshold? 
+ commonClusterNum.add(entry.getKey()); + totalNumberofTrainingData += size; + } + } + + return commonClusterNum; + } + + private List getCommonClustersByRatio(Map> clusterMap, List clusterResults) { + List commonClusterNum = new ArrayList<>(); + + double sizes = (double) clusterResults.size(); + Map ratios = new HashMap<>(); + for (Map.Entry> entry : clusterMap.entrySet()) { + List elements = entry.getValue(); + ratios.put(entry.getKey(), (double) elements.size() / sizes); + } + MapSorter sorter = new MapSorter(); + ratios = sorter.sortByValueDescending(ratios); + double counterRatio = 0.0; + for (Map.Entry entry : ratios.entrySet()) { + counterRatio += entry.getValue(); + commonClusterNum.add(entry.getKey()); + totalNumberofTrainingData += clusterMap.get(entry.getKey()).size(); + if (counterRatio >= 0.8) { // TODO: how to set the value of this threshold? + break; + } + } + + return commonClusterNum; + } + + public int getTotalNumberofTrainingData() { + return totalNumberofTrainingData; + } + +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/DataPrepare/DataPreparation.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/DataPrepare/DataPreparation.java new file mode 100644 index 0000000..ce03dcf --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/DataPrepare/DataPreparation.java @@ -0,0 +1,560 @@ +package edu.lu.uni.serval.FixPatternMining.DataPrepare; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Scanner; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.MaxSizeSelector.MaxSizeType; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.data.DataPreparer; +import edu.lu.uni.serval.utils.FileHelper; + +/** + * Prepare data for fix patterns mining and evaluation. + * + * @author kui.liu + * + */ +public class DataPreparation { + + /** + * Prepare data for token embedding in the process of fix patterns mining. + */ + public static void prepareDataForTokenEmbedding() { + // Collect all data into one file. + String editScriptsFilePath = Configuration.EDITSCRIPTS_FILE_PATH; + String patchesSourceCodeFilePath = Configuration.PATCH_SOURCECODE_FILE_PATH; + String buggyTokensFilePath = Configuration.BUGGYTREE_FILE_PATH; + String editScriptSizesFilePath = Configuration.EDITSCRIPT_SIZES_FILE_PATH; + + String editScriptsFile = Configuration.EDITSCRIPT_SIZES_FILE; + String patchesSourceCodeFile = Configuration.PATCH_SOURCECODE_FILE; + String buggyTokensFile = Configuration.BUGGY_CODY_TOKENS_FILE; + String editScriptSizesFile = Configuration.EDITSCRIPT_SIZES_FILE; + File file = new File(editScriptsFilePath); + File[] subFiles = file.listFiles(); + + // Merge results of parsed patches. 
+ for (File subFile : subFiles) { + String fileName = subFile.getName(); // edistScripts file + String id = fileName.substring(fileName.lastIndexOf("_")); + FileHelper.outputToFile(editScriptsFile, FileHelper.readFile(subFile), true); + String patchesSourceCode = patchesSourceCodeFilePath + "patches" + id; + FileHelper.outputToFile(patchesSourceCodeFile, FileHelper.readFile(patchesSourceCode), true); + String sizes = editScriptSizesFile + "sizes" + id; + FileHelper.outputToFile(editScriptSizesFilePath, FileHelper.readFile(sizes), true); + String buggyTokens = buggyTokensFilePath + "tokens" + id; + FileHelper.outputToFile(buggyTokensFile, FileHelper.readFile(buggyTokens), true); + } + + + // Select data by the size of edit script vectors. + List sizesList; + try { + sizesList = MaxSizeSelector.readSizes(editScriptSizesFile); + int maxSize = MaxSizeSelector.selectMaxSize(MaxSizeType.ThirdQuartile, sizesList); + List outlierIndexes = new ArrayList<>(); + for (int i = 0, size = sizesList.size(); i < size; i ++) { + if (sizesList.get(i) > maxSize) { + outlierIndexes.add(i); + } + } + FileHelper.outputToFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_EDIT_SCRIPTS, "" + maxSize, false); + + selectData(editScriptsFile, outlierIndexes, Configuration.SELECTED_EDITSCRIPTES_FILE); + selectData(patchesSourceCodeFile, outlierIndexes, Configuration.PATCH_SIGNAL, Configuration.SELECTED_PATCHES_SOURE_CODE_FILE); + int maxTokenVectorSize = selectDataOfSourceCodeTokens(buggyTokensFile, outlierIndexes, Configuration.SELECTED_BUGGY_TOKEN_FILE); + FileHelper.outputToFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE, "" + maxTokenVectorSize, false); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private static void selectData(String intputFile, List outlierIndexList, String outputFile) { + List outlierIndexes = new ArrayList<>(); + outlierIndexes.addAll(outlierIndexList); + FileInputStream fis = null; + Scanner scanner = null; + try { + fis = new FileInputStream(intputFile); + scanner = new Scanner(fis); + int index = 0; + StringBuilder builder = new StringBuilder(); + int counter = 0; + + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + if (outlierIndexes.contains(index)) { + outlierIndexes.remove(new Integer(index)); + } else { + builder.append(line + "\n"); + if (++ counter % 100000 == 0) { + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } + } + index ++; + } + + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } finally { + try { + if (scanner != null) { + scanner.close(); + scanner = null; + } + if (fis != null) { + fis.close(); + fis = null; + } + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + private static void selectData(String inputFile, List outlierIndexes, String startingSignal, String outputFile) { + FileInputStream fis = null; + Scanner scanner = null; + try { + fis = new FileInputStream(inputFile); + scanner = new Scanner(fis); + int index = -1; + StringBuilder builder = new StringBuilder(); + int counter = 0; + String singleEntity = ""; + + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + if (line.equals(startingSignal)) { + if (!"".equals(singleEntity)) { + if (outlierIndexes.contains(index)) { + outlierIndexes.remove(new Integer(index)); + } else { + builder.append(singleEntity + "\n"); + if (++ counter % 100000 == 0) { + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); 
+ } + } + singleEntity = ""; + } + index ++; + } + singleEntity += line + "\n"; + } + + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } finally { + try { + if (scanner != null) { + scanner.close(); + scanner = null; + } + if (fis != null) { + fis.close(); + fis = null; + } + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + private static int selectDataOfSourceCodeTokens(String inputFile, List outlierIndexList, String outputFile) { + List outlierIndexes = new ArrayList<>(); + outlierIndexes.addAll(outlierIndexList); + FileInputStream fis = null; + Scanner scanner = null; + int size = 0; + try { + fis = new FileInputStream(inputFile); + scanner = new Scanner(fis); + int index = 0; + StringBuilder builder = new StringBuilder(); + int counter = 0; + + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + if (outlierIndexes.contains(index)) { + outlierIndexes.remove(new Integer(index)); + } else { + builder.append(line + "\n"); + if (++ counter % 100000 == 0) { + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } + String[] tokens = line.split(" "); + if (tokens.length > size) size = tokens.length; + } + index ++; + } + + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } finally { + try { + if (scanner != null) { + scanner.close(); + scanner = null; + } + if (fis != null) { + fis.close(); + fis = null; + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + return size; + } + + /** + * Prepare data for feature learning. + */ + public static void prepareDataForFeatureLearning() { + String zeroVector = ""; + for (int i =0, length = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN1 - 1; i < length; i ++) { + zeroVector += "0, "; + } + zeroVector += "0"; + int maxSize = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_EDIT_SCRIPTS).trim()); + + String embeddedTokensFile = Configuration.EMBEDDED_EDIT_SCRIPT_TOKENS; + Map embeddedTokens = readEmbeddedTokens(embeddedTokensFile); + + String editScriptsFile = Configuration.SELECTED_EDITSCRIPTES_FILE; + String outputFile = Configuration.VECTORIED_EDIT_SCRIPTS; + dataPrepare(editScriptsFile, maxSize, outputFile, embeddedTokens, zeroVector); + } + + private static Map readEmbeddedTokens(String embeddedTokensFile) { + Map embeddedTokens = new HashMap<>(); + File file = new File(embeddedTokensFile); + FileInputStream fis = null; + Scanner scanner = null; + try { + fis = new FileInputStream(file); + scanner = new Scanner(fis); + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + int firstBlankIndex = line.indexOf(" "); + String token = line.substring(0, firstBlankIndex); + String value = line.substring(firstBlankIndex + 1).replaceAll(" ", ", "); + embeddedTokens.put(token, value); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } finally { + try { + scanner.close(); + fis.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + return embeddedTokens; + } + + private static void dataPrepare(String inputFile, int maxSize, String outputFile, Map embeddedTokens, String zeroVector) { + File file = new File(inputFile); + FileInputStream fis = null; + Scanner scanner = null; + StringBuilder builder = new StringBuilder(); + int counter = 0; + + try { + fis = new FileInputStream(file); + scanner = new Scanner(fis); + while 
(scanner.hasNextLine()) { + String line = scanner.nextLine(); + StringBuilder vectorStr = convertToVector(embeddedTokens, line, maxSize, zeroVector); + builder.append(vectorStr); + if (++ counter % 10000 == 0) { + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } finally { + try { + scanner.close(); + fis.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } + + private static StringBuilder convertToVector(Map embeddedTokens, String line, int maxSize, String zeroVector) { + String[] tokens = line.split(" "); + StringBuilder vectorStr = new StringBuilder(); + int length = tokens.length; + if (length == maxSize) { + for (int i = 0; i < length - 1; i ++) { + String token = tokens[i]; + vectorStr.append(embeddedTokens.get(token) + ", "); + } + vectorStr.append(embeddedTokens.get(tokens[length - 1]) + "\n"); + } else { + for (int i = 0; i < length; i ++) { + String token = tokens[i]; + vectorStr.append(embeddedTokens.get(token) + ", "); + } + for (int i = length; i < maxSize - 1; i ++) { + vectorStr.append(zeroVector + ", "); + } + vectorStr.append(zeroVector + "\n"); + } + + return vectorStr; + } + + /** + * Prepare data for clustering. + */ + public static void prepareDataForClustering() { + String featureFile = Configuration.EXTRACTED_FEATURES + "vectorizedEditScripts.csv"; + String arffFile = Configuration.CLUSTER_INPUT; + DataPreparer.prepareData(featureFile, arffFile); + } + + /** + * Read cluster results. + */ + public static List readClusterResults() { + List clusterResults = new ArrayList<>(); + String clusterResultsFile = Configuration.CLUSTER_OUTPUT; + String results = FileHelper.readFile(clusterResultsFile); + BufferedReader reader = null; + try { + reader = new BufferedReader(new StringReader(results)); + String line = null; + while ((line = reader.readLine()) != null) { + clusterResults.add(Integer.parseInt(line)); + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + try { + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + return clusterResults; + } + + public static Map> readClusterResult(List clusterResults) { + Map> clusters = new HashMap<>(); + + for (int i = 0, size = clusterResults.size(); i < size; i ++) { + int clusterNo = clusterResults.get(i); + if (clusters.containsKey(clusterNo)) { + clusters.get(clusterNo).add(i + 1); + } else { + List newCLuster = new ArrayList<>(); + newCLuster.add(i + 1); + clusters.put(clusterNo, newCLuster); + } + } + + return clusters; + } + + /** + * Data for un-supervised learning. 
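prepareDataForClustering() above hands the CNN feature CSV to DataPreparer.prepareData (from the simple-utils dependency, which is not part of this patch) to build the Weka ARFF input for Step6. Under the assumption that each CSV row is a plain comma-separated feature vector, a minimal version of that conversion could be:

    import java.io.IOException;
    import java.io.PrintWriter;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.List;

    public class CsvToArffSketch {
        /** Writes a numeric-attribute ARFF header followed by the CSV rows unchanged. */
        static void toArff(String featureCsv, String arffFile) throws IOException {
            List<String> rows = Files.readAllLines(Paths.get(featureCsv));
            int numFeatures = rows.get(0).split(",").length;   // assumes at least one non-empty row
            try (PrintWriter out = new PrintWriter(arffFile)) {
                out.println("@relation editScriptFeatures");
                for (int i = 0; i < numFeatures; i++) {
                    out.println("@attribute f" + i + " numeric");
                }
                out.println("@data");
                for (String row : rows) {
                    out.println(row);
                }
            }
        }
    }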
+ */ + public static void prepareTokensForEvaluation1() { + String outputFile = Configuration.EMBEDDING_DATA_TOKENS1; + FileHelper.outputToFile(outputFile, FileHelper.readFile(Configuration.SELECTED_BUGGY_TOKEN_FILE), false); + List files = FileHelper.getAllFilesInCurrentDiectory(Configuration.TEST_DATA_FILE, ".list"); + for (File file : files) { + FileHelper.outputToFile(outputFile, FileHelper.readFile(file), true); + } + } + + public static void prepareDataForFeatureLearningOfEvaluation1() { + String zeroVector = ""; + for (int i =0, length = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2 - 1; i < length; i ++) { + zeroVector += "0, "; + } + zeroVector += "0"; + int maxSize = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE)); + + String allEmbeddedTokens = Configuration.EMBEDDED_ALL_TOKENS1; + Map embeddedTokens = readEmbeddedTokens(allEmbeddedTokens); + + // Testing data + String clusteredTokens = Configuration.TEST_DATA_FILE; + List files = FileHelper.getAllFilesInCurrentDiectory(clusteredTokens, ".list"); + for (File file : files) { + + } + String allTokensOfSourceCode = Configuration.EMBEDDING_DATA_TOKENS1; // TODO testing data should be separated. + dataPrepare(allTokensOfSourceCode, maxSize, Configuration.VECTORIED_ALL_SOURCE_CODE1, embeddedTokens, zeroVector); + } + + /** + * Data for supervised learning. + */ + public static void prepareTokensForEvaluation2(Map commonClustersMappingLabel) { + String clusteredTokens = Configuration.CLUSTERED_TOKENSS_FILE; + String outputFile = Configuration.EMBEDDING_DATA_TOKENS2; + + List files = FileHelper.getAllFilesInCurrentDiectory(clusteredTokens, ".list"); + for (File file : files) { + String fileName = file.getName(); + String clusterNumStr = fileName.substring(fileName.lastIndexOf("_") + 1, fileName.lastIndexOf(".list")); + int clusterNum = Integer.parseInt(clusterNumStr); + if (commonClustersMappingLabel.containsKey(clusterNum)) { + String content = FileHelper.readFile(file); + FileHelper.outputToFile(outputFile, content, true); + } + } + files.clear(); + files = FileHelper.getAllFilesInCurrentDiectory(Configuration.TEST_DATA_FILE, ".list"); + for (File file : files) { + FileHelper.outputToFile(outputFile, FileHelper.readFile(file), true); + } + } + + public static void prepareDataForFeatureLearningOfEvaluation2(Map commonClustersMappingLabel) { + String zeroVector = ""; + for (int i =0, length = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2 - 1; i < length; i ++) { + zeroVector += "0, "; + } + zeroVector += "0"; + + String allEmbeddedTokensOfEvaluation = Configuration.EMBEDDED_ALL_TOKENS2; + Map embeddedTokens = readEmbeddedTokens(allEmbeddedTokensOfEvaluation); + + int maxSize = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE)); + // Training data + String clusteredTokens = Configuration.CLUSTERED_TOKENSS_FILE; + List files = FileHelper.getAllFilesInCurrentDiectory(clusteredTokens, ".list"); + for (File file : files) { + String fileName = file.getName(); + String clusterNumStr = fileName.substring(fileName.lastIndexOf("_") + 1, fileName.lastIndexOf(".list")); + int clusterNum = Integer.parseInt(clusterNumStr); + if (commonClustersMappingLabel.containsKey(clusterNum)) { + dataPrepare(file.getPath(), maxSize, Configuration.TRAINING_DATA, embeddedTokens, zeroVector, clusterNum); + } + } + // Testing data + files.clear(); + String testingData = Configuration.TEST_DATA_FILE; + files = FileHelper.getAllFilesInCurrentDiectory(testingData, ".list"); + String 
testingDataPath = Configuration.TESTING_DATA; + for (File file : files) { + String fileName = file.getName(); + fileName.replace(".list", ".csv"); + dataPrepare(file.getPath(), maxSize, testingDataPath + fileName, embeddedTokens, zeroVector, 0); + } + } + + private static void dataPrepare(String inputFile, int maxSize, String outputFile, Map embeddedTokens, + String zeroVector, int clusterNum) { + FileInputStream fis = null; + Scanner scanner = null; + StringBuilder builder = new StringBuilder(); + int counter = 0; + + try { + fis = new FileInputStream(inputFile); + scanner = new Scanner(fis); + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + StringBuilder vectorStr = convertToVector(embeddedTokens, line, maxSize, zeroVector, clusterNum); + builder.append(vectorStr); + if (++ counter % 10000 == 0) { + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } finally { + try { + scanner.close(); + fis.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + FileHelper.outputToFile(outputFile, builder, true); + builder.setLength(0); + } + + private static StringBuilder convertToVector(Map embeddedTokens, String line, int maxSize, String zeroVector, int clusterNum) { + String[] tokens = line.split(" "); + StringBuilder vectorStr = new StringBuilder(); + int length = tokens.length; + if (length == maxSize) { + for (int i = 0; i < length; i ++) { + String token = tokens[i]; + vectorStr.append(embeddedTokens.get(token) + ", "); + } + } else { + for (int i = 0; i < length; i ++) { + String token = tokens[i]; + vectorStr.append(embeddedTokens.get(token) + ", "); + } + for (int i = length; i < maxSize; i ++) { + vectorStr.append(zeroVector + ", "); + } + } + + vectorStr.append(clusterNum + "\n"); + + return vectorStr; + } + + public static Map readCommonCLusters() { + Map commonClustersMappingLabel = new HashMap<>(); + String commonClusters = FileHelper.readFile(Configuration.CLUSTERNUMBER_LABEL_MAP); + BufferedReader reader = null; + try { + reader = new BufferedReader(new StringReader(commonClusters)); + String line = reader.readLine(); + while ((line = reader.readLine()) != null) { + String[] strArray = line.split(" : "); + int key = Integer.parseInt(strArray[1]); + int value = Integer.parseInt(strArray[0]); + commonClustersMappingLabel.put(key, value); + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + try { + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + return commonClustersMappingLabel; + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/DataPrepare/MaxSizeSelector.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/DataPrepare/MaxSizeSelector.java new file mode 100644 index 0000000..1724dde --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/DataPrepare/MaxSizeSelector.java @@ -0,0 +1,66 @@ +package edu.lu.uni.serval.FixPatternMining.DataPrepare; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import edu.lu.uni.serval.utils.FileHelper; +import edu.lu.uni.serval.utils.ListSorter; + +public class MaxSizeSelector { + + public enum MaxSizeType { + UpperWhisker, ThirdQuartile + } + + public static List readSizes(String sizeFilePath) throws IOException { + List sizes = new ArrayList<>(); + String sizesStr = FileHelper.readFile(sizeFilePath); + BufferedReader br = new 
BufferedReader(new StringReader(sizesStr)); + String line = null; + + while ((line = br.readLine()) != null) { + sizes.add(Integer.parseInt(line.trim())); + } + + return sizes; + } + + public static int selectMaxSize(MaxSizeType maxSizeType, List sizesDistribution) { + int maxSize = 0; + switch (maxSizeType) { + case UpperWhisker: + maxSize = upperWhisker(sizesDistribution); + break; + case ThirdQuartile: + maxSize = thirdQuarter(sizesDistribution); + break; + } + return maxSize; + } + + private static int upperWhisker(List sizesDistribution) { + List sizes = new ArrayList<>(); + sizes.addAll(sizesDistribution); + ListSorter sorter = new ListSorter(sizes); + sizesDistribution = sorter.sortAscending(); + int firstQuarterIndex = sizesDistribution.size() * 25 / 100; + int firstQuarter = sizesDistribution.get(firstQuarterIndex); + int thirdQuarterIndex = sizesDistribution.size() * 75 / 100; + int thirdQuarter = sizesDistribution.get(thirdQuarterIndex); + int upperWhisker = thirdQuarter + (int) (1.5 * (thirdQuarter - firstQuarter)); + return upperWhisker; + } + + private static int thirdQuarter(List sizesDistribution) { + List sizes = new ArrayList<>(); + sizes.addAll(sizesDistribution); + ListSorter sorter = new ListSorter(sizes); + sizesDistribution = sorter.sortAscending(); + int thirdQuarterIndex = sizesDistribution.size() * 75 / 100; + int thirdQuarter = sizesDistribution.get(thirdQuarterIndex); + return thirdQuarter; + } +} diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/FeatureLearner.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/FeatureLearner.java new file mode 100644 index 0000000..6d717ec --- /dev/null +++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/FeatureLearner.java @@ -0,0 +1,121 @@ +package edu.lu.uni.serval.FixPatternMining; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation; +import edu.lu.uni.serval.config.Configuration; +import edu.lu.uni.serval.deeplearner.CNNFeatureExtractor2; +import edu.lu.uni.serval.deeplearner.CNNSupervisedLearning; +import edu.lu.uni.serval.utils.FileHelper; + +public class FeatureLearner { + + /** + * Learn features of edit scripts for fix patterns mining. 
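learnFeatures() below drives CNNFeatureExtractor2, which comes from the MyFeatureLearner artifact and is therefore not included in this patch. As a rough sketch of the idea with DeepLearning4J: each vectorized edit script is treated as a one-channel maxTokens x embeddingSize input, pushed through convolution and pooling layers, and the activations of a 200-wide dense layer (matching sizeOfFeatureVector) are kept as the learned features. The seed (123) and the 20/50 filter counts mirror the setters used in FeatureLearner; the reconstruction-style output layer and the remaining hyper-parameters are assumptions, not the extractor's actual architecture.

    import java.util.List;
    import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
    import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
    import org.deeplearning4j.nn.conf.inputs.InputType;
    import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;
    import org.deeplearning4j.nn.conf.layers.DenseLayer;
    import org.deeplearning4j.nn.conf.layers.OutputLayer;
    import org.deeplearning4j.nn.conf.layers.SubsamplingLayer;
    import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
    import org.nd4j.linalg.activations.Activation;
    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.lossfunctions.LossFunctions;

    public class CnnFeatureSketch {
        static MultiLayerNetwork buildNet(int maxTokens, int embeddingSize) {
            MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                    .seed(123)
                    .list()
                    .layer(0, new ConvolutionLayer.Builder(1, embeddingSize)   // slide over whole token vectors
                            .nIn(1).nOut(20).activation(Activation.RELU).build())
                    .layer(1, new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX)
                            .kernelSize(2, 1).stride(2, 1).build())
                    .layer(2, new ConvolutionLayer.Builder(2, 1)
                            .nOut(50).activation(Activation.RELU).build())
                    .layer(3, new DenseLayer.Builder().nOut(200)               // the extracted feature vector
                            .activation(Activation.RELU).build())
                    .layer(4, new OutputLayer.Builder(LossFunctions.LossFunction.MSE)
                            .nOut(maxTokens * embeddingSize).activation(Activation.IDENTITY).build())
                    .setInputType(InputType.convolutional(maxTokens, embeddingSize, 1))
                    .build();
            MultiLayerNetwork net = new MultiLayerNetwork(conf);
            net.init();
            return net;
        }

        /** After training, the dense-layer activations serve as the learned features. */
        static INDArray extractFeatures(MultiLayerNetwork net, INDArray batch) {
            List<INDArray> activations = net.feedForward(batch, false);
            return activations.get(4);   // output of layer index 3 (the 200-wide dense layer)
        }
    }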
diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/FeatureLearner.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/FeatureLearner.java
new file mode 100644
index 0000000..6d717ec
--- /dev/null
+++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/FeatureLearner.java
@@ -0,0 +1,121 @@
+package edu.lu.uni.serval.FixPatternMining;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import edu.lu.uni.serval.FixPatternMining.DataPrepare.DataPreparation;
+import edu.lu.uni.serval.config.Configuration;
+import edu.lu.uni.serval.deeplearner.CNNFeatureExtractor2;
+import edu.lu.uni.serval.deeplearner.CNNSupervisedLearning;
+import edu.lu.uni.serval.utils.FileHelper;
+
+public class FeatureLearner {
+
+	/**
+	 * Learn features of edit scripts for fix patterns mining.
+	 */
+	public void learnFeatures() {
+		String editScriptsVectorFile = Configuration.VECTORIED_EDIT_SCRIPTS; // input
+		int sizeOfVector = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_EDIT_SCRIPTS).trim());
+		int sizeOfTokenVec = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN1;
+		int batchSize = 1000;
+		int sizeOfFeatureVector = 200;
+
+		try {
+			CNNFeatureExtractor2 learner = new CNNFeatureExtractor2(new File(editScriptsVectorFile), sizeOfVector, sizeOfTokenVec, batchSize, sizeOfFeatureVector);
+			learner.setNumberOfEpochs(20);
+			learner.setSeed(123);
+			learner.setNumOfOutOfLayer1(20);
+			learner.setNumOfOutOfLayer2(50);
+			learner.setOutputPath(Configuration.EXTRACTED_FEATURES);
+
+			learner.extracteFeaturesWithCNN();
+		} catch (FileNotFoundException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		} catch (InterruptedException e) {
+			e.printStackTrace();
+		}
+	}
+
+	public void learnFeaturesOfSourceCode() {
+		int sizeOfVector = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE));
+		int sizeOfTokenVec = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2;
+		int batchSize = 1000;
+		int sizeOfExtractedFeatureVector = 200;
+
+		try {
+			CNNFeatureExtractor2 learner = new CNNFeatureExtractor2(new File(Configuration.VECTORIED_ALL_SOURCE_CODE1), sizeOfVector, sizeOfTokenVec, batchSize, sizeOfExtractedFeatureVector);
+			learner.setNumberOfEpochs(20);
+			learner.setSeed(123);
+			learner.setNumOfOutOfLayer1(20);
+			learner.setNumOfOutOfLayer2(50);
+			learner.setOutputPath(Configuration.EXTRACTED_FEATURES_EVALUATION);
+
+			learner.extracteFeaturesWithCNN();
+		} catch (FileNotFoundException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		} catch (InterruptedException e) {
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * Supervised learning.
+	 */
+	public void learnFeaturesOfSourceCode2(File testingData) {
+		int sizeOfVector = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE));
+		int sizeOfTokenVec = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2;
+		int batchSize = 1000;
+		int sizeOfExtractedFeatureVector = 200;
+
+		try {
+			int clusterNum = DataPreparation.readCommonCLusters().size();
+			File trainingData = new File(Configuration.TRAINING_DATA);
+			CNNSupervisedLearning learner = new CNNSupervisedLearning(trainingData, sizeOfVector,
+					sizeOfTokenVec, batchSize, sizeOfExtractedFeatureVector, clusterNum, testingData);
+			learner.setNumberOfEpochs(20);
+			learner.setSeed(123);
+			learner.setNumOfOutOfLayer1(20);
+			learner.setNumOfOutOfLayer2(50);
+			learner.setOutputPath(Configuration.FEATURES_OF_TRAINING_DATA);
+			learner.setFeatresOfTestingData(Configuration.FEATURES_OF_TESTING_DATA);
+			learner.setPossibilitiesOfPrediction(Configuration.POSSIBILITIES_OF_TESTING_DATA);
+			learner.setPredictedResultsOfTestingData(Configuration.PREDICTED_RESULTS_OF_TESTING_DATA);
+			learner.setModelFile(Configuration.SUPERVISED_LEARNING_MODEL);
+			learner.extracteFeaturesWithCNN();
+		} catch (FileNotFoundException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		} catch (InterruptedException e) {
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * Supervised learning by loading a model.
+	 */
+	public void learnFeaturesOfSourceCode3(File testingData) {
+		int batchSize = 1000;
+
+		try {
+			String modelFile = Configuration.SUPERVISED_LEARNING_MODEL;
+			CNNSupervisedLearning learner = new CNNSupervisedLearning(batchSize, testingData, modelFile);
+			learner.setFeatresOfTestingData(Configuration.FEATURES_OF_TESTING_DATA);
+			learner.setPossibilitiesOfPrediction(Configuration.POSSIBILITIES_OF_TESTING_DATA);
+			learner.setPredictedResultsOfTestingData(Configuration.PREDICTED_RESULTS_OF_TESTING_DATA);
+			learner.extracteFeaturesWithCNNByLoadingModel();
+		} catch (FileNotFoundException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		} catch (InterruptedException e) {
+			e.printStackTrace();
+		}
+	}
+}
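FeatureLearner now exposes three entry points: learnFeatures() and learnFeaturesOfSourceCode() for unsupervised CNN feature extraction, learnFeaturesOfSourceCode2(File) for supervised training plus prediction, and learnFeaturesOfSourceCode3(File) for prediction with the previously saved model. A rough driver for the supervised path could look like the sketch below; the chunked layout of Configuration.TESTING_DATA and the train-once-then-reuse policy are assumptions for illustration, not part of this patch:

import java.io.File;

import edu.lu.uni.serval.FixPatternMining.FeatureLearner;
import edu.lu.uni.serval.config.Configuration;

public class SupervisedDriverSketch {
	public static void main(String[] args) {
		File[] testingChunks = new File(Configuration.TESTING_DATA).listFiles();
		if (testingChunks == null || testingChunks.length == 0) {
			return; // nothing prepared by DataPreparation yet
		}
		FeatureLearner learner = new FeatureLearner();
		// Train the CNN once; this also saves SUPERVISED_LEARNING_MODEL ...
		learner.learnFeaturesOfSourceCode2(testingChunks[0]);
		// ... then reuse the saved model for the remaining chunks.
		for (int i = 1; i < testingChunks.length; i++) {
			learner.learnFeaturesOfSourceCode3(testingChunks[i]);
		}
	}
}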
diff --git a/src/main/java/edu/lu/uni/serval/FixPatternMining/TokenEmbedder.java b/src/main/java/edu/lu/uni/serval/FixPatternMining/TokenEmbedder.java
new file mode 100644
index 0000000..b980ded
--- /dev/null
+++ b/src/main/java/edu/lu/uni/serval/FixPatternMining/TokenEmbedder.java
@@ -0,0 +1,65 @@
+package edu.lu.uni.serval.FixPatternMining;
+
+import java.io.File;
+import java.io.IOException;
+
+import edu.lu.uni.serval.config.Configuration;
+import edu.lu.uni.serval.deeplearner.Word2VecEncoder;
+
+/**
+ * Encode tokens of edit scripts with Word2Vec.
+ *
+ * @author kui.liu
+ *
+ */
+public class TokenEmbedder {
+
+	/**
+	 * Embed tokens for fix patterns mining.
+	 */
+	public void embedTokensOfEditScripts() {
+		Word2VecEncoder encoder = new Word2VecEncoder();
+		int windowSize = 2;
+		encoder.setWindowSize(windowSize);
+		try {
+			File inputFile = new File(Configuration.SELECTED_EDITSCRIPTES_FILE);
+			int minWordFrequency = 1;
+			int layerSize = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN1;
+			String outputFileName = Configuration.EMBEDDED_EDIT_SCRIPT_TOKENS;
+			encoder.embedTokens(inputFile, minWordFrequency, layerSize, outputFileName);
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+
+	public void embedTokensOfSourceCodeForSupervisedTesting() {
+		Word2VecEncoder encoder = new Word2VecEncoder();
+		int windowSize = 2;
+		encoder.setWindowSize(windowSize);
+		try {
+			File inputFile = new File(Configuration.EMBEDDING_DATA_TOKENS2);
+			int minWordFrequency = 1;
+			int layerSize = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2;
+			String outputFileName = Configuration.EMBEDDED_ALL_TOKENS2;
+			encoder.embedTokens(inputFile, minWordFrequency, layerSize, outputFileName);
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+
+	public void embedTokensOfSourceCodeForUnsupervisedTesting() {
+		Word2VecEncoder encoder = new Word2VecEncoder();
+		int windowSize = 2;
+		encoder.setWindowSize(windowSize);
+		try {
+			File inputFile = new File(Configuration.EMBEDDING_DATA_TOKENS1);
+			int minWordFrequency = 1;
+			int layerSize = Configuration.VECTOR_SIZE_OF_EMBEDED_TOKEN2;
+			String outputFileName = Configuration.EMBEDDED_ALL_TOKENS1;
+			encoder.embedTokens(inputFile, minWordFrequency, layerSize, outputFileName);
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+
+}
diff --git a/src/main/java/edu/lu/uni/serval/config/Configuration.java b/src/main/java/edu/lu/uni/serval/config/Configuration.java
index ecba9f9..4fca1e5 100644
--- a/src/main/java/edu/lu/uni/serval/config/Configuration.java
+++ b/src/main/java/edu/lu/uni/serval/config/Configuration.java
@@ -24,13 +24,13 @@ public class Configuration {
 	public static final String BUGGY_CODY_TOKENS_FILE = GUM_TREE_OUTPUT + "tokens.list";
 	public static final String EDITSCRIPT_SIZES_FILE = GUM_TREE_OUTPUT + "editScriptSizes.list";
-	public static int MAX_EDIT_SCRIPT_VECTOR_SIZE = 0; // The max size of edit script vectors.
-	public static int MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE = 0; // The max size of all buggy source code token vectors.
 	public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN1 = 100; // tokens of edit scripts.
 	public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN2 = 200; // tokens of source code
 	// the input path of fix patterns mining.
 	private static final String MINING_INPUT = ROOT_PATH + "MiningInput/";
+	public static final String MAX_TOKEN_VECTORS_SIZE_OF_EDIT_SCRIPTS = MINING_INPUT + "/MaxTokenVectorSizeOfEditScripts.list"; // The max size of edit scripts: upper limitation of max size.
+	public static final String MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE = MINING_INPUT + "/MaxTokenVectorSizeOfBuggySourceCode.list"; // The max size of all buggy source code token vectors.
 	// the input path of token embedding.
 	public static final String EMBEDDING_INPUT = MINING_INPUT + "Embedding/";
 	public static final String SELECTED_PATCHES_SOURE_CODE_FILE = EMBEDDING_INPUT + "patchSourceCode.list";// Selected patches.
@@ -42,7 +42,7 @@ public class Configuration {
 	public static final String EMBEDDED_EDIT_SCRIPT_TOKENS = FEATURE_LEARNING_INPUT + "embeddedEditScriptTokens.list"; // All embedded tokens of selected edit scripts.
 	public static final String VECTORIED_EDIT_SCRIPTS = FEATURE_LEARNING_INPUT + "vectorizedEditScripts.csv"; // Embedded and vectorized edit script vectors.
 	// the input path of clustering.
-	public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/"; // Extracted features of edit scripts.
+	public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/"; // Extracted features of all edit scripts.
 	public static final String CLUSTER_INPUT = MINING_INPUT + "ClusteringInput/input.arff";
 
 	// the output path of fix patterns mining.
@@ -53,20 +53,28 @@ public class Configuration {
 	// evaluation data
 	public static final String TEST_INPUT = ROOT_PATH + "TestProjects/";
-	public static final String TEST_LOCALIZATION_FILE = ROOT_PATH + "TestData/Localization.list"; // Positions of all test statements.
-	public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements.list"; // Token vectors of all test statements.
+	public static final String TEST_POSITION_FILE = ROOT_PATH + "TestData/Positions/"; // Positions of all test statements.
+	public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements/"; // Token vectors of all test statements.
+	
+	public static final String NUMBER_OF_TRAINING_DATA = ROOT_PATH + "TestData/NumberOfTrainingData.list";
 	
 	// data of unsupervised learning
 	public static final String EMBEDDING_DATA_TOKENS1 = ROOT_PATH + "TestData/AllTokenVectorsForEvaluation.list";
 	public static final String EMBEDDED_ALL_TOKENS1 = ROOT_PATH + "TestData/AllEmbeddedTokens.list";
-	public static final String VECTORIED_ALL_SOURCE_CODE1 = ROOT_PATH + "TestData/AllVectorizedSourceCode.list";
-	public static final String EXTRACTED_FEATURES_TESTING = ROOT_PATH + "TestDataExtractedFeatures/";
+	public static final String VECTORIED_ALL_SOURCE_CODE1 = ROOT_PATH + "TestData/AllVectorizedSourceCode/";
+	public static final String EXTRACTED_FEATURES_EVALUATION = ROOT_PATH + "TestDataExtractedFeatures/"; // extracted features of all source code (training data and testing data)
 	
 	// Data of supervised learning
 	public static final String CLUSTERNUMBER_LABEL_MAP = ROOT_PATH + "TestData/clusterMappingLabel.list";
 	public static final String EMBEDDING_DATA_TOKENS2 = ROOT_PATH + "TestData/AllTokenVectorsForSupervisedEvaluation.list";
 	public static final String EMBEDDED_ALL_TOKENS2 = ROOT_PATH + "TestData/AllEmbeddedTokensForSuperVisedEvaluation.list";
 	public static final String TRAINING_DATA = ROOT_PATH + "TestData/TrainingData.csv"; // Training data of supervised learning
-	public static final String TESTING_DATA = ROOT_PATH + "TestData/TestingData.csv"; // testing data of supervised learning
-	
+	public static final String TESTING_DATA = ROOT_PATH + "TestData/SupervisedLearning/"; // testing data of supervised learning
+	public static final String FEATURES_OF_TRAINING_DATA = ROOT_PATH + "TestingOutput/TraingFeatures/";
+	public static final String FEATURES_OF_TESTING_DATA = ROOT_PATH + "TestingOutput/TestingFeatures/";
+	public static final String POSSIBILITIES_OF_TESTING_DATA = ROOT_PATH + "TestingOutput/Posibilities/";
+	public static final String PREDICTED_RESULTS_OF_TESTING_DATA = ROOT_PATH + "TestingOutput/Prediction/";
+	
+	public static final String SUPERVISED_LEARNING_MODEL = ROOT_PATH + "TestingOutput/SupervisedLearningModel.zip";
+	
 }
diff --git a/src/main/java/edu/lu/uni/serval/evaluation/ProjectScanner.java b/src/main/java/edu/lu/uni/serval/evaluation/ProjectScanner.java
index b6948ba..0a34a4c 100644
--- a/src/main/java/edu/lu/uni/serval/evaluation/ProjectScanner.java
+++ b/src/main/java/edu/lu/uni/serval/evaluation/ProjectScanner.java
@@ -28,23 +28,17 @@ import edu.lu.uni.serval.utils.FileHelper;
  */
 public class ProjectScanner {
 
-	public static void main(String[] args) {
-		String inputPath = Configuration.TEST_INPUT; //test java projects
-		File inputFileDirector = new File(inputPath);
-		File[] projects = inputFileDirector.listFiles(); // project folders
-		
-		String outputLocalizeFile = Configuration.TEST_LOCALIZATION_FILE;
-		String outputTokensFile = Configuration.TEST_DATA_FILE;
-		
+	private int maxSize = Integer.parseInt(FileHelper.readFile(Configuration.MAX_TOKEN_VECTORS_SIZE_OF_SOURCE_CODE));
+	private int numberOfFiles = 0;
+	private List allSimpleTrees = new ArrayList<>();
+	
+	public void scanJavaProject(File[] projects, String outputLocalizeFile, String outputTokensFile, int limitation) {
 		for (File project : projects) {
-			ProjectScanner scanner = new ProjectScanner();
-			scanner.scanJavaProject(project, outputLocalizeFile, outputTokensFile);
+			scanJavaProject(project, outputLocalizeFile, outputTokensFile, limitation);
 		}
 	}
-	
-	List allSimpleTrees = new ArrayList<>();
-	
-	public void scanJavaProject(File javaProject, String outputLocalizeFile, String outputTokensFile) {
+	public void scanJavaProject(File javaProject, String outputLocalizeFile, String outputTokensFile, int limitation) {
 		List files = new ArrayList<>();
 		files.addAll(FileHelper.getAllFiles(javaProject.getPath(), ".java"));
@@ -60,19 +54,24 @@ public class ProjectScanner {
 			CUCreator cuCreator = new CUCreator();
 			CompilationUnit cUnit = cuCreator.createCompilationUnit(file);
 			getTokenVectorOfAllStatements(tree, cUnit, tokensBuilder, localizationsBuilder, javaProject.getPath(), file.getPath());
-			
-			if (++ counter % 1000 == 0) {
-				FileHelper.outputToFile(outputLocalizeFile, localizationsBuilder, true);
-				FileHelper.outputToFile(outputTokensFile, tokensBuilder, true);
+			++ counter;
+			
+			if (counter % limitation == 0) {
+				numberOfFiles ++;
+				FileHelper.outputToFile(outputLocalizeFile + "Positions" + numberOfFiles + ".list", localizationsBuilder, true);
+				FileHelper.outputToFile(outputTokensFile + "Tokens" + numberOfFiles + ".list", tokensBuilder, true);
 				localizationsBuilder.setLength(0);
 				tokensBuilder.setLength(0);
 			}
 		}
-		FileHelper.outputToFile(outputLocalizeFile, localizationsBuilder, true);
-		FileHelper.outputToFile(outputTokensFile, tokensBuilder, true);
-		localizationsBuilder.setLength(0);
-		tokensBuilder.setLength(0);
+		if (localizationsBuilder.length() > 0) {
+			numberOfFiles ++;
+			FileHelper.outputToFile(outputLocalizeFile + "Positions" + numberOfFiles + ".list", localizationsBuilder, true);
+			FileHelper.outputToFile(outputTokensFile + "Tokens" + numberOfFiles + ".list", tokensBuilder, true);
+			localizationsBuilder.setLength(0);
+			tokensBuilder.setLength(0);
+		}
 	}
 
 	private void getTokenVectorOfAllStatements(ITree tree, CompilationUnit unit, StringBuilder tokensBuilder, StringBuilder localizationsBuilder, String projectName, String filePath) {
@@ -98,7 +97,8 @@ public class ProjectScanner {
 			// project name: file name: line number
 			String tokens = Tokenizer.getTokensDeepFirst(simpleTree).trim();
 			String[] tokensArray = tokens.split(" ");
-			if (tokensArray.length <= Configuration.MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE) {
+			
+			if (tokensArray.length <= maxSize) {
 				int position = tree.getPos();
 				int lineNum = unit.getLineNumber(position);
 				tokensBuilder.append(tokens).append("\n");
@@ -183,4 +183,5 @@ public class ProjectScanner {
 		simpleTree.setParent(parent);
 		return simpleTree;
 	}
+	
 }
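Since ProjectScanner no longer has a main method, callers now have to drive the scan themselves. A minimal replacement driver, assuming the directory-style TEST_POSITION_FILE/TEST_DATA_FILE constants above and the 1000-file flush threshold that the old code hard-coded (both assumptions for illustration):

import java.io.File;

import edu.lu.uni.serval.config.Configuration;
import edu.lu.uni.serval.evaluation.ProjectScanner;

public class ScanDriverSketch {
	public static void main(String[] args) {
		File[] projects = new File(Configuration.TEST_INPUT).listFiles(); // test project folders
		if (projects == null) {
			return;
		}
		ProjectScanner scanner = new ProjectScanner();
		// 'limitation' controls how many parsed files go into each Positions*/Tokens* output chunk.
		scanner.scanJavaProject(projects, Configuration.TEST_POSITION_FILE, Configuration.TEST_DATA_FILE, 1000);
	}
}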