Fix the StringIndexOutOfBoundsException in Tokenizer.
This commit is contained in:
@@ -162,10 +162,10 @@ public class Parser {
|
||||
String semiSourceCodeEditScripts = getSemiSourceCodeEditScripts(actionSet);
|
||||
|
||||
|
||||
this.buggyTrees += Configuration.BUGGY_TREE_TOKEN + "\n" + simpleTree.toString() + "\n";
|
||||
this.buggyTrees += Configuration.BUGGY_TREE_SIGNAL + "\n" + simpleTree.toString() + "\n";
|
||||
this.tokensOfSourceCode += getTokensDeepFirst(simpleTree).trim() + "\n";
|
||||
this.actionSets += Configuration.BUGGY_TREE_TOKEN + "\n" + readActionSet(actionSet, "") + "\n";
|
||||
this.originalTree += Configuration.BUGGY_TREE_TOKEN + "\n" + actionSet.getOriginalTree().toString() + "\n";
|
||||
this.actionSets += Configuration.BUGGY_TREE_SIGNAL + "\n" + readActionSet(actionSet, "") + "\n";
|
||||
this.originalTree += Configuration.BUGGY_TREE_SIGNAL + "\n" + actionSet.getOriginalTree().toString() + "\n";
|
||||
|
||||
// // Source Code of patches.
|
||||
// String patchSourceCode = getPatchSourceCode(sourceCode, startLineNum, endLineNum, startLineNum2,
|
||||
|
||||
@@ -142,7 +142,7 @@ public class SingleStatementParser {
|
||||
String astEditScripts = getASTEditScripts(actionSet);
|
||||
int size = astEditScripts.split(" ").length;
|
||||
if (size == 1) {
|
||||
System.out.println(actionSet);
|
||||
// System.out.println(actionSet);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -150,7 +150,7 @@ public class SingleStatementParser {
|
||||
String patchSourceCode = getPatchSourceCode(hunk, startLine, endLine, startLine2, endLine2);
|
||||
if ("".equals(patchSourceCode)) continue;
|
||||
|
||||
this.patchesSourceCode += Configuration.PATCH_TOKEN + "\n" + patchSourceCode + "\n";
|
||||
this.patchesSourceCode += Configuration.PATCH_SIGNAL + "\n" + patchSourceCode + "\n";
|
||||
this.sizes += size + "\n";
|
||||
this.astEditScripts += astEditScripts + "\n";
|
||||
// 2. source code: raw tokens
|
||||
|
||||
@@ -10,30 +10,17 @@ public class Tokenizer {
|
||||
String tokens = "";
|
||||
List<SimpleTree> children = simpleTree.getChildren();
|
||||
String astNodeType = simpleTree.getNodeType();
|
||||
if ("AssertStatement".equals(astNodeType) || "DoStatement".equals(astNodeType)
|
||||
|| "ForStatement".equals(astNodeType) || "IfStatement".equals(astNodeType)
|
||||
|| "ReturnStatement".equals(astNodeType) || "SwitchStatement".equals(astNodeType)
|
||||
|| "SynchronizedStatement".equals(astNodeType) || "ThrowStatement".equals(astNodeType)
|
||||
|| "TryStatement".equals(astNodeType) || "WhileStatement".equals(astNodeType)) {
|
||||
String label = simpleTree.getLabel();
|
||||
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
|
||||
tokens += label + " ";
|
||||
} else if ("EnhancedForStatement".equals(astNodeType)) {
|
||||
tokens += "for ";
|
||||
} else if ("CatchClause".equals(astNodeType)) {
|
||||
tokens += "catch ";
|
||||
} else if ("SwitchCase".equals(astNodeType)) {
|
||||
tokens += "case ";
|
||||
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
|
||||
tokens += "super ";
|
||||
} else if ("ConstructorInvocation".equals(astNodeType)) {
|
||||
tokens += "this ";
|
||||
} else if ("FinallyBody".equals(astNodeType)) {
|
||||
tokens += "finally ";
|
||||
}
|
||||
|
||||
if (children.isEmpty()) {
|
||||
if ("StringLiteral".equals(astNodeType)) {
|
||||
|
||||
if (children.isEmpty()) { // BreakStatement, ContinueStatement, ReturnStatement, TryStatement
|
||||
if (astNodeType.endsWith("Statement")) {
|
||||
String label = astNodeType;
|
||||
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
|
||||
tokens += astNodeType + " " + label + " ";
|
||||
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
|
||||
tokens += astNodeType + " super ";
|
||||
} else if ("ConstructorInvocation".equals(astNodeType)) {
|
||||
tokens += astNodeType + " this ";
|
||||
} else if ("StringLiteral".equals(astNodeType)) {
|
||||
tokens += astNodeType + " stringLiteral ";
|
||||
} else if ("CharacterLiteral".equals(astNodeType)) {
|
||||
tokens += astNodeType + " charLiteral ";
|
||||
@@ -43,6 +30,30 @@ public class Tokenizer {
|
||||
tokens += astNodeType + " " + simpleTree.getLabel() + " ";
|
||||
}
|
||||
} else {
|
||||
if ("AssertStatement".equals(astNodeType) || "DoStatement".equals(astNodeType)
|
||||
|| "ForStatement".equals(astNodeType) || "IfStatement".equals(astNodeType)
|
||||
|| "ReturnStatement".equals(astNodeType) || "SwitchStatement".equals(astNodeType)
|
||||
|| "SynchronizedStatement".equals(astNodeType) || "ThrowStatement".equals(astNodeType)
|
||||
|| "TryStatement".equals(astNodeType) || "WhileStatement".equals(astNodeType)) {
|
||||
String label = astNodeType;
|
||||
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
|
||||
tokens += astNodeType + " " + label + " ";
|
||||
} else if ("EnhancedForStatement".equals(astNodeType)) {
|
||||
tokens += astNodeType + " " + "for ";
|
||||
} else if ("CatchClause".equals(astNodeType)) {
|
||||
tokens += astNodeType + " " + "catch ";
|
||||
} else if ("SwitchCase".equals(astNodeType)) {
|
||||
tokens += astNodeType + " case ";
|
||||
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
|
||||
tokens += astNodeType + " super ";
|
||||
} else if ("ConstructorInvocation".equals(astNodeType)) {
|
||||
tokens += astNodeType + " this ";
|
||||
} else if ("FinallyBody".equals(astNodeType)) {
|
||||
tokens += astNodeType + " finally ";
|
||||
} else if ("LabeledStatement".equals(astNodeType)) {
|
||||
tokens += "LabeledStatement " + simpleTree.getLabel();
|
||||
}
|
||||
|
||||
if ("ArrayInitializer".equals(astNodeType)) {
|
||||
tokens += astNodeType + " arrayInitializer ";
|
||||
} else {
|
||||
|
||||
@@ -1,18 +1,14 @@
|
||||
package edu.lu.uni.serval.config;
|
||||
|
||||
public class Configuration {
|
||||
private static final String ROOT_PATH = "../";
|
||||
private static final String ROOT_PATH = "../"; // The root path of all output data.
|
||||
|
||||
public static final int HUNK_SIZE = 7;
|
||||
public static final String BUGGY_TREE_TOKEN = "BUGGY_TREE###";
|
||||
public static final String PATCH_TOKEN = "PATCH###";
|
||||
|
||||
public static int MAXZ_SIZE = 0;
|
||||
public static int TOKEN_VECTOR_SIZE = 0;
|
||||
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN = 100;
|
||||
public static final int HUNK_SIZE = 7; // The limitation of source code lines of each DiffEntry, which will be selected as training data.
|
||||
public static final String BUGGY_TREE_SIGNAL = "BUGGY_TREE###"; // The starting signal of the tree of buggy source code .
|
||||
public static final String PATCH_SIGNAL = "PATCH###"; // The starting signal of each patch.
|
||||
|
||||
// input path of GumTree. (i.e., Fix patterns parser)
|
||||
public static final String GUM_TREE_INPUT = ROOT_PATH + "GumTreeInput/";
|
||||
public static final String GUM_TREE_INPUT = ROOT_PATH + "GumTreeInput/";// Buggy version file VS. Fixing version file, (DiffEntry File)
|
||||
|
||||
// the output path of GumTree results.
|
||||
private static final String GUM_TREE_OUTPUT = ROOT_PATH + "GumTreeResults/";
|
||||
@@ -28,28 +24,49 @@ public class Configuration {
|
||||
public static final String BUGGY_CODY_TOKENS_FILE = GUM_TREE_OUTPUT + "tokens.list";
|
||||
public static final String EDITSCRIPT_SIZES_FILE = GUM_TREE_OUTPUT + "editScriptSizes.list";
|
||||
|
||||
public static int MAX_EDIT_SCRIPT_VECTOR_SIZE = 0; // The max size of edit script vectors.
|
||||
public static int MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE = 0; // The max size of all buggy source code token vectors.
|
||||
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN1 = 100; // tokens of edit scripts.
|
||||
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN2 = 200; // tokens of source code
|
||||
|
||||
// the input path of fix patterns mining.
|
||||
private static final String MINING_INPUT = ROOT_PATH + "MiningInput/";
|
||||
// the input path of token embedding.
|
||||
public static final String EMBEDDING_INPUT = MINING_INPUT + "Embedding/";
|
||||
public static final String SELECTED_PATCHES_SOURE_CODE_FILE = EMBEDDING_INPUT + "patchSourceCode.list";
|
||||
public static final String SELECTED_PATCHES_SOURE_CODE_FILE = EMBEDDING_INPUT + "patchSourceCode.list";// Selected patches.
|
||||
public static final String SELECTED_BUGGY_TREE_FILE = EMBEDDING_INPUT + "buggyTrees.list";
|
||||
public static final String SELECTED_BUGGY_TOKEN_FILE = EMBEDDING_INPUT + "tokens.list";
|
||||
public static final String SELECTED_EDITSCRIPTES_FILE = EMBEDDING_INPUT + "editScripts.list";
|
||||
public static final String SELECTED_BUGGY_TOKEN_FILE = EMBEDDING_INPUT + "tokens.list"; // Selected token vectors of buggy source code.
|
||||
public static final String SELECTED_EDITSCRIPTES_FILE = EMBEDDING_INPUT + "editScripts.list"; // Selected edit script vectors.
|
||||
// the input path of feature learning.
|
||||
public static final String FEATURE_LEARNING_INPUT = MINING_INPUT + "FeatureLearning/";
|
||||
public static final String EMBEDDED_EDIT_SCRIPT_TOKENS = FEATURE_LEARNING_INPUT + "embeddedEditScriptTokens.list";
|
||||
public static final String VECTORIED_EDIT_SCRIPTS = FEATURE_LEARNING_INPUT + "vectorizedEditScripts.csv";
|
||||
public static final String EMBEDDED_EDIT_SCRIPT_TOKENS = FEATURE_LEARNING_INPUT + "embeddedEditScriptTokens.list"; // All embedded tokens of selected edit scripts.
|
||||
public static final String VECTORIED_EDIT_SCRIPTS = FEATURE_LEARNING_INPUT + "vectorizedEditScripts.csv"; // Embedded and vectorized edit script vectors.
|
||||
// the input path of clustering.
|
||||
public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/";
|
||||
public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/"; // Extracted features of edit scripts.
|
||||
public static final String CLUSTER_INPUT = MINING_INPUT + "ClusteringInput/input.arff";
|
||||
|
||||
// the output path of fix patterns mining.
|
||||
private static final String MINING_OUTPUT = ROOT_PATH + "MiningOutput/";
|
||||
public static final String CLUSTER_OUTPUT = MINING_OUTPUT + "ClusteringOutput/clusterResults.list";
|
||||
|
||||
public static final String CLUSTERED_PATCHES_FILE = MINING_OUTPUT + "ClusteredPatches/";
|
||||
public static final String CLUSTERED_TOKENSS_FILE = MINING_OUTPUT + "ClusteredTokens/"; // Token vectors of buggy source code.
|
||||
|
||||
// evaluation data
|
||||
public static final String TEST_INPUT = ROOT_PATH + "TestProjects/";
|
||||
public static final String TEST_LOCALIZATION_FILE = ROOT_PATH + "TestData/Localization.list";
|
||||
public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements.list";
|
||||
public static final String TEST_LOCALIZATION_FILE = ROOT_PATH + "TestData/Localization.list"; // Positions of all test statements.
|
||||
public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements.list"; // Token vectors of all test statements.
|
||||
|
||||
// data of unsupervised learning
|
||||
public static final String EMBEDDING_DATA_TOKENS1 = ROOT_PATH + "TestData/AllTokenVectorsForEvaluation.list";
|
||||
public static final String EMBEDDED_ALL_TOKENS1 = ROOT_PATH + "TestData/AllEmbeddedTokens.list";
|
||||
public static final String VECTORIED_ALL_SOURCE_CODE1 = ROOT_PATH + "TestData/AllVectorizedSourceCode.list";
|
||||
public static final String EXTRACTED_FEATURES_TESTING = ROOT_PATH + "TestDataExtractedFeatures/";
|
||||
|
||||
// Data of supervised learning
|
||||
public static final String CLUSTERNUMBER_LABEL_MAP = ROOT_PATH + "TestData/clusterMappingLabel.list";
|
||||
public static final String EMBEDDING_DATA_TOKENS2 = ROOT_PATH + "TestData/AllTokenVectorsForSupervisedEvaluation.list";
|
||||
public static final String EMBEDDED_ALL_TOKENS2 = ROOT_PATH + "TestData/AllEmbeddedTokensForSuperVisedEvaluation.list";
|
||||
public static final String TRAINING_DATA = ROOT_PATH + "TestData/TrainingData.csv"; // Training data of supervised learning
|
||||
public static final String TESTING_DATA = ROOT_PATH + "TestData/TestingData.csv"; // testing data of supervised learning
|
||||
|
||||
}
|
||||
|
||||
@@ -38,15 +38,15 @@ public class ProjectScanner {
|
||||
|
||||
for (File project : projects) {
|
||||
ProjectScanner scanner = new ProjectScanner();
|
||||
scanner.scanJavaProject(project.getPath(), outputLocalizeFile, outputTokensFile);
|
||||
scanner.scanJavaProject(project, outputLocalizeFile, outputTokensFile);
|
||||
}
|
||||
}
|
||||
|
||||
List<SimpleTree> allSimpleTrees = new ArrayList<>();
|
||||
|
||||
public void scanJavaProject(String javaProject, String outputLocalizeFile, String outputTokensFile) {
|
||||
public void scanJavaProject(File javaProject, String outputLocalizeFile, String outputTokensFile) {
|
||||
List<File> files = new ArrayList<>();
|
||||
files.addAll(FileHelper.getAllFiles(javaProject, ".java"));
|
||||
files.addAll(FileHelper.getAllFiles(javaProject.getPath(), ".java"));
|
||||
|
||||
StringBuilder tokensBuilder = new StringBuilder();
|
||||
StringBuilder localizationsBuilder = new StringBuilder();
|
||||
@@ -59,7 +59,7 @@ public class ProjectScanner {
|
||||
|
||||
CUCreator cuCreator = new CUCreator();
|
||||
CompilationUnit cUnit = cuCreator.createCompilationUnit(file);
|
||||
getTokenVectorOfAllStatements(tree, cUnit, tokensBuilder, localizationsBuilder, javaProject, file.getPath());
|
||||
getTokenVectorOfAllStatements(tree, cUnit, tokensBuilder, localizationsBuilder, javaProject.getPath(), file.getPath());
|
||||
|
||||
if (++ counter % 1000 == 0) {
|
||||
FileHelper.outputToFile(outputLocalizeFile, localizationsBuilder, true);
|
||||
@@ -75,7 +75,7 @@ public class ProjectScanner {
|
||||
tokensBuilder.setLength(0);
|
||||
}
|
||||
|
||||
public void getTokenVectorOfAllStatements(ITree tree, CompilationUnit unit, StringBuilder tokensBuilder, StringBuilder localizationsBuilder, String projectName, String filePath) {
|
||||
private void getTokenVectorOfAllStatements(ITree tree, CompilationUnit unit, StringBuilder tokensBuilder, StringBuilder localizationsBuilder, String projectName, String filePath) {
|
||||
String astNodeType = ASTNodeMap.map.get(tree.getType()); //ignore: SwitchCase, SuperConstructorInvocation, ConstructorInvocation
|
||||
if ((astNodeType.endsWith("Statement") && !astNodeType.equals("TypeDeclarationStatement"))
|
||||
|| astNodeType.equals("FieldDeclaration")) {
|
||||
@@ -98,7 +98,7 @@ public class ProjectScanner {
|
||||
// project name: file name: line number
|
||||
String tokens = Tokenizer.getTokensDeepFirst(simpleTree).trim();
|
||||
String[] tokensArray = tokens.split(" ");
|
||||
if (tokensArray.length <= Configuration.TOKEN_VECTOR_SIZE) {
|
||||
if (tokensArray.length <= Configuration.MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE) {
|
||||
int position = tree.getPos();
|
||||
int lineNum = unit.getLineNumber(position);
|
||||
tokensBuilder.append(tokens).append("\n");
|
||||
|
||||
Reference in New Issue
Block a user