Fix the StringIndexOutOfBoundsException in Tokenizer.

This commit is contained in:
Kui LIU
2017-08-02 13:40:43 +02:00
parent 13ed0c16f4
commit 1c7d58c17c
5 changed files with 81 additions and 53 deletions
@@ -162,10 +162,10 @@ public class Parser {
String semiSourceCodeEditScripts = getSemiSourceCodeEditScripts(actionSet);
this.buggyTrees += Configuration.BUGGY_TREE_TOKEN + "\n" + simpleTree.toString() + "\n";
this.buggyTrees += Configuration.BUGGY_TREE_SIGNAL + "\n" + simpleTree.toString() + "\n";
this.tokensOfSourceCode += getTokensDeepFirst(simpleTree).trim() + "\n";
this.actionSets += Configuration.BUGGY_TREE_TOKEN + "\n" + readActionSet(actionSet, "") + "\n";
this.originalTree += Configuration.BUGGY_TREE_TOKEN + "\n" + actionSet.getOriginalTree().toString() + "\n";
this.actionSets += Configuration.BUGGY_TREE_SIGNAL + "\n" + readActionSet(actionSet, "") + "\n";
this.originalTree += Configuration.BUGGY_TREE_SIGNAL + "\n" + actionSet.getOriginalTree().toString() + "\n";
// // Source Code of patches.
// String patchSourceCode = getPatchSourceCode(sourceCode, startLineNum, endLineNum, startLineNum2,
@@ -142,7 +142,7 @@ public class SingleStatementParser {
String astEditScripts = getASTEditScripts(actionSet);
int size = astEditScripts.split(" ").length;
if (size == 1) {
System.out.println(actionSet);
// System.out.println(actionSet);
continue;
}
@@ -150,7 +150,7 @@ public class SingleStatementParser {
String patchSourceCode = getPatchSourceCode(hunk, startLine, endLine, startLine2, endLine2);
if ("".equals(patchSourceCode)) continue;
this.patchesSourceCode += Configuration.PATCH_TOKEN + "\n" + patchSourceCode + "\n";
this.patchesSourceCode += Configuration.PATCH_SIGNAL + "\n" + patchSourceCode + "\n";
this.sizes += size + "\n";
this.astEditScripts += astEditScripts + "\n";
// 2. source code: raw tokens
@@ -10,30 +10,17 @@ public class Tokenizer {
String tokens = "";
List<SimpleTree> children = simpleTree.getChildren();
String astNodeType = simpleTree.getNodeType();
if ("AssertStatement".equals(astNodeType) || "DoStatement".equals(astNodeType)
|| "ForStatement".equals(astNodeType) || "IfStatement".equals(astNodeType)
|| "ReturnStatement".equals(astNodeType) || "SwitchStatement".equals(astNodeType)
|| "SynchronizedStatement".equals(astNodeType) || "ThrowStatement".equals(astNodeType)
|| "TryStatement".equals(astNodeType) || "WhileStatement".equals(astNodeType)) {
String label = simpleTree.getLabel();
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
tokens += label + " ";
} else if ("EnhancedForStatement".equals(astNodeType)) {
tokens += "for ";
} else if ("CatchClause".equals(astNodeType)) {
tokens += "catch ";
} else if ("SwitchCase".equals(astNodeType)) {
tokens += "case ";
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
tokens += "super ";
} else if ("ConstructorInvocation".equals(astNodeType)) {
tokens += "this ";
} else if ("FinallyBody".equals(astNodeType)) {
tokens += "finally ";
}
if (children.isEmpty()) {
if ("StringLiteral".equals(astNodeType)) {
if (children.isEmpty()) { // BreakStatement, ContinueStatement, ReturnStatement, TryStatement
if (astNodeType.endsWith("Statement")) {
String label = astNodeType;
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
tokens += astNodeType + " " + label + " ";
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
tokens += astNodeType + " super ";
} else if ("ConstructorInvocation".equals(astNodeType)) {
tokens += astNodeType + " this ";
} else if ("StringLiteral".equals(astNodeType)) {
tokens += astNodeType + " stringLiteral ";
} else if ("CharacterLiteral".equals(astNodeType)) {
tokens += astNodeType + " charLiteral ";
@@ -43,6 +30,30 @@ public class Tokenizer {
tokens += astNodeType + " " + simpleTree.getLabel() + " ";
}
} else {
if ("AssertStatement".equals(astNodeType) || "DoStatement".equals(astNodeType)
|| "ForStatement".equals(astNodeType) || "IfStatement".equals(astNodeType)
|| "ReturnStatement".equals(astNodeType) || "SwitchStatement".equals(astNodeType)
|| "SynchronizedStatement".equals(astNodeType) || "ThrowStatement".equals(astNodeType)
|| "TryStatement".equals(astNodeType) || "WhileStatement".equals(astNodeType)) {
String label = astNodeType;
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
tokens += astNodeType + " " + label + " ";
} else if ("EnhancedForStatement".equals(astNodeType)) {
tokens += astNodeType + " " + "for ";
} else if ("CatchClause".equals(astNodeType)) {
tokens += astNodeType + " " + "catch ";
} else if ("SwitchCase".equals(astNodeType)) {
tokens += astNodeType + " case ";
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
tokens += astNodeType + " super ";
} else if ("ConstructorInvocation".equals(astNodeType)) {
tokens += astNodeType + " this ";
} else if ("FinallyBody".equals(astNodeType)) {
tokens += astNodeType + " finally ";
} else if ("LabeledStatement".equals(astNodeType)) {
tokens += "LabeledStatement " + simpleTree.getLabel();
}
if ("ArrayInitializer".equals(astNodeType)) {
tokens += astNodeType + " arrayInitializer ";
} else {
@@ -1,18 +1,14 @@
package edu.lu.uni.serval.config;
public class Configuration {
private static final String ROOT_PATH = "../";
private static final String ROOT_PATH = "../"; // The root path of all output data.
public static final int HUNK_SIZE = 7;
public static final String BUGGY_TREE_TOKEN = "BUGGY_TREE###";
public static final String PATCH_TOKEN = "PATCH###";
public static int MAXZ_SIZE = 0;
public static int TOKEN_VECTOR_SIZE = 0;
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN = 100;
public static final int HUNK_SIZE = 7; // The limitation of source code lines of each DiffEntry, which will be selected as training data.
public static final String BUGGY_TREE_SIGNAL = "BUGGY_TREE###"; // The starting signal of the tree of buggy source code.
public static final String PATCH_SIGNAL = "PATCH###"; // The starting signal of each patch.
// input path of GumTree. (i.e., Fix patterns parser)
public static final String GUM_TREE_INPUT = ROOT_PATH + "GumTreeInput/";
public static final String GUM_TREE_INPUT = ROOT_PATH + "GumTreeInput/";// Buggy version file VS. Fixing version file, (DiffEntry File)
// the output path of GumTree results.
private static final String GUM_TREE_OUTPUT = ROOT_PATH + "GumTreeResults/";
@@ -28,28 +24,49 @@ public class Configuration {
public static final String BUGGY_CODY_TOKENS_FILE = GUM_TREE_OUTPUT + "tokens.list";
public static final String EDITSCRIPT_SIZES_FILE = GUM_TREE_OUTPUT + "editScriptSizes.list";
public static int MAX_EDIT_SCRIPT_VECTOR_SIZE = 0; // The max size of edit script vectors.
public static int MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE = 0; // The max size of all buggy source code token vectors.
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN1 = 100; // tokens of edit scripts.
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN2 = 200; // tokens of source code
// the input path of fix patterns mining.
private static final String MINING_INPUT = ROOT_PATH + "MiningInput/";
// the input path of token embedding.
public static final String EMBEDDING_INPUT = MINING_INPUT + "Embedding/";
public static final String SELECTED_PATCHES_SOURE_CODE_FILE = EMBEDDING_INPUT + "patchSourceCode.list";
public static final String SELECTED_PATCHES_SOURE_CODE_FILE = EMBEDDING_INPUT + "patchSourceCode.list";// Selected patches.
public static final String SELECTED_BUGGY_TREE_FILE = EMBEDDING_INPUT + "buggyTrees.list";
public static final String SELECTED_BUGGY_TOKEN_FILE = EMBEDDING_INPUT + "tokens.list";
public static final String SELECTED_EDITSCRIPTES_FILE = EMBEDDING_INPUT + "editScripts.list";
public static final String SELECTED_BUGGY_TOKEN_FILE = EMBEDDING_INPUT + "tokens.list"; // Selected token vectors of buggy source code.
public static final String SELECTED_EDITSCRIPTES_FILE = EMBEDDING_INPUT + "editScripts.list"; // Selected edit script vectors.
// the input path of feature learning.
public static final String FEATURE_LEARNING_INPUT = MINING_INPUT + "FeatureLearning/";
public static final String EMBEDDED_EDIT_SCRIPT_TOKENS = FEATURE_LEARNING_INPUT + "embeddedEditScriptTokens.list";
public static final String VECTORIED_EDIT_SCRIPTS = FEATURE_LEARNING_INPUT + "vectorizedEditScripts.csv";
public static final String EMBEDDED_EDIT_SCRIPT_TOKENS = FEATURE_LEARNING_INPUT + "embeddedEditScriptTokens.list"; // All embedded tokens of selected edit scripts.
public static final String VECTORIED_EDIT_SCRIPTS = FEATURE_LEARNING_INPUT + "vectorizedEditScripts.csv"; // Embedded and vectorized edit script vectors.
// the input path of clustering.
public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/";
public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/"; // Extracted features of edit scripts.
public static final String CLUSTER_INPUT = MINING_INPUT + "ClusteringInput/input.arff";
// the output path of fix patterns mining.
private static final String MINING_OUTPUT = ROOT_PATH + "MiningOutput/";
public static final String CLUSTER_OUTPUT = MINING_OUTPUT + "ClusteringOutput/clusterResults.list";
public static final String CLUSTERED_PATCHES_FILE = MINING_OUTPUT + "ClusteredPatches/";
public static final String CLUSTERED_TOKENSS_FILE = MINING_OUTPUT + "ClusteredTokens/"; // Token vectors of buggy source code.
// evaluation data
public static final String TEST_INPUT = ROOT_PATH + "TestProjects/";
public static final String TEST_LOCALIZATION_FILE = ROOT_PATH + "TestData/Localization.list";
public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements.list";
public static final String TEST_LOCALIZATION_FILE = ROOT_PATH + "TestData/Localization.list"; // Positions of all test statements.
public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements.list"; // Token vectors of all test statements.
// data of unsupervised learning
public static final String EMBEDDING_DATA_TOKENS1 = ROOT_PATH + "TestData/AllTokenVectorsForEvaluation.list";
public static final String EMBEDDED_ALL_TOKENS1 = ROOT_PATH + "TestData/AllEmbeddedTokens.list";
public static final String VECTORIED_ALL_SOURCE_CODE1 = ROOT_PATH + "TestData/AllVectorizedSourceCode.list";
public static final String EXTRACTED_FEATURES_TESTING = ROOT_PATH + "TestDataExtractedFeatures/";
// Data of supervised learning
public static final String CLUSTERNUMBER_LABEL_MAP = ROOT_PATH + "TestData/clusterMappingLabel.list";
public static final String EMBEDDING_DATA_TOKENS2 = ROOT_PATH + "TestData/AllTokenVectorsForSupervisedEvaluation.list";
public static final String EMBEDDED_ALL_TOKENS2 = ROOT_PATH + "TestData/AllEmbeddedTokensForSuperVisedEvaluation.list";
public static final String TRAINING_DATA = ROOT_PATH + "TestData/TrainingData.csv"; // Training data of supervised learning
public static final String TESTING_DATA = ROOT_PATH + "TestData/TestingData.csv"; // Testing data of supervised learning
}
@@ -38,15 +38,15 @@ public class ProjectScanner {
for (File project : projects) {
ProjectScanner scanner = new ProjectScanner();
scanner.scanJavaProject(project.getPath(), outputLocalizeFile, outputTokensFile);
scanner.scanJavaProject(project, outputLocalizeFile, outputTokensFile);
}
}
List<SimpleTree> allSimpleTrees = new ArrayList<>();
public void scanJavaProject(String javaProject, String outputLocalizeFile, String outputTokensFile) {
public void scanJavaProject(File javaProject, String outputLocalizeFile, String outputTokensFile) {
List<File> files = new ArrayList<>();
files.addAll(FileHelper.getAllFiles(javaProject, ".java"));
files.addAll(FileHelper.getAllFiles(javaProject.getPath(), ".java"));
StringBuilder tokensBuilder = new StringBuilder();
StringBuilder localizationsBuilder = new StringBuilder();
@@ -59,7 +59,7 @@ public class ProjectScanner {
CUCreator cuCreator = new CUCreator();
CompilationUnit cUnit = cuCreator.createCompilationUnit(file);
getTokenVectorOfAllStatements(tree, cUnit, tokensBuilder, localizationsBuilder, javaProject, file.getPath());
getTokenVectorOfAllStatements(tree, cUnit, tokensBuilder, localizationsBuilder, javaProject.getPath(), file.getPath());
if (++ counter % 1000 == 0) {
FileHelper.outputToFile(outputLocalizeFile, localizationsBuilder, true);
@@ -75,7 +75,7 @@ public class ProjectScanner {
tokensBuilder.setLength(0);
}
public void getTokenVectorOfAllStatements(ITree tree, CompilationUnit unit, StringBuilder tokensBuilder, StringBuilder localizationsBuilder, String projectName, String filePath) {
private void getTokenVectorOfAllStatements(ITree tree, CompilationUnit unit, StringBuilder tokensBuilder, StringBuilder localizationsBuilder, String projectName, String filePath) {
String astNodeType = ASTNodeMap.map.get(tree.getType()); //ignore: SwitchCase, SuperConstructorInvocation, ConstructorInvocation
if ((astNodeType.endsWith("Statement") && !astNodeType.equals("TypeDeclarationStatement"))
|| astNodeType.equals("FieldDeclaration")) {
@@ -98,7 +98,7 @@ public class ProjectScanner {
// project name: file name: line number
String tokens = Tokenizer.getTokensDeepFirst(simpleTree).trim();
String[] tokensArray = tokens.split(" ");
if (tokensArray.length <= Configuration.TOKEN_VECTOR_SIZE) {
if (tokensArray.length <= Configuration.MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE) {
int position = tree.getPos();
int lineNum = unit.getLineNumber(position);
tokensBuilder.append(tokens).append("\n");