Fix the StringIndexOutOfBoundsException in Tokenizer.

This commit is contained in:
Kui LIU
2017-08-02 13:40:43 +02:00
parent 13ed0c16f4
commit 1c7d58c17c
5 changed files with 81 additions and 53 deletions
@@ -162,10 +162,10 @@ public class Parser {
String semiSourceCodeEditScripts = getSemiSourceCodeEditScripts(actionSet);
this.buggyTrees += Configuration.BUGGY_TREE_TOKEN + "\n" + simpleTree.toString() + "\n";
this.buggyTrees += Configuration.BUGGY_TREE_SIGNAL + "\n" + simpleTree.toString() + "\n";
this.tokensOfSourceCode += getTokensDeepFirst(simpleTree).trim() + "\n";
this.actionSets += Configuration.BUGGY_TREE_TOKEN + "\n" + readActionSet(actionSet, "") + "\n";
this.originalTree += Configuration.BUGGY_TREE_TOKEN + "\n" + actionSet.getOriginalTree().toString() + "\n";
this.actionSets += Configuration.BUGGY_TREE_SIGNAL + "\n" + readActionSet(actionSet, "") + "\n";
this.originalTree += Configuration.BUGGY_TREE_SIGNAL + "\n" + actionSet.getOriginalTree().toString() + "\n";
// // Source Code of patches.
// String patchSourceCode = getPatchSourceCode(sourceCode, startLineNum, endLineNum, startLineNum2,
@@ -142,7 +142,7 @@ public class SingleStatementParser {
String astEditScripts = getASTEditScripts(actionSet);
int size = astEditScripts.split(" ").length;
if (size == 1) {
System.out.println(actionSet);
// System.out.println(actionSet);
continue;
}
@@ -150,7 +150,7 @@ public class SingleStatementParser {
String patchSourceCode = getPatchSourceCode(hunk, startLine, endLine, startLine2, endLine2);
if ("".equals(patchSourceCode)) continue;
this.patchesSourceCode += Configuration.PATCH_TOKEN + "\n" + patchSourceCode + "\n";
this.patchesSourceCode += Configuration.PATCH_SIGNAL + "\n" + patchSourceCode + "\n";
this.sizes += size + "\n";
this.astEditScripts += astEditScripts + "\n";
// 2. source code: raw tokens
@@ -10,30 +10,17 @@ public class Tokenizer {
String tokens = "";
List<SimpleTree> children = simpleTree.getChildren();
String astNodeType = simpleTree.getNodeType();
if ("AssertStatement".equals(astNodeType) || "DoStatement".equals(astNodeType)
|| "ForStatement".equals(astNodeType) || "IfStatement".equals(astNodeType)
|| "ReturnStatement".equals(astNodeType) || "SwitchStatement".equals(astNodeType)
|| "SynchronizedStatement".equals(astNodeType) || "ThrowStatement".equals(astNodeType)
|| "TryStatement".equals(astNodeType) || "WhileStatement".equals(astNodeType)) {
String label = simpleTree.getLabel();
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
tokens += label + " ";
} else if ("EnhancedForStatement".equals(astNodeType)) {
tokens += "for ";
} else if ("CatchClause".equals(astNodeType)) {
tokens += "catch ";
} else if ("SwitchCase".equals(astNodeType)) {
tokens += "case ";
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
tokens += "super ";
} else if ("ConstructorInvocation".equals(astNodeType)) {
tokens += "this ";
} else if ("FinallyBody".equals(astNodeType)) {
tokens += "finally ";
}
if (children.isEmpty()) {
if ("StringLiteral".equals(astNodeType)) {
if (children.isEmpty()) { // BreakStatement, ContinueStatement, ReturnStatement, TryStatement
if (astNodeType.endsWith("Statement")) {
String label = astNodeType;
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
tokens += astNodeType + " " + label + " ";
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
tokens += astNodeType + " super ";
} else if ("ConstructorInvocation".equals(astNodeType)) {
tokens += astNodeType + " this ";
} else if ("StringLiteral".equals(astNodeType)) {
tokens += astNodeType + " stringLiteral ";
} else if ("CharacterLiteral".equals(astNodeType)) {
tokens += astNodeType + " charLiteral ";
@@ -43,6 +30,30 @@ public class Tokenizer {
tokens += astNodeType + " " + simpleTree.getLabel() + " ";
}
} else {
if ("AssertStatement".equals(astNodeType) || "DoStatement".equals(astNodeType)
|| "ForStatement".equals(astNodeType) || "IfStatement".equals(astNodeType)
|| "ReturnStatement".equals(astNodeType) || "SwitchStatement".equals(astNodeType)
|| "SynchronizedStatement".equals(astNodeType) || "ThrowStatement".equals(astNodeType)
|| "TryStatement".equals(astNodeType) || "WhileStatement".equals(astNodeType)) {
String label = astNodeType;
label = label.substring(0, label.lastIndexOf("S")).toLowerCase();
tokens += astNodeType + " " + label + " ";
} else if ("EnhancedForStatement".equals(astNodeType)) {
tokens += astNodeType + " " + "for ";
} else if ("CatchClause".equals(astNodeType)) {
tokens += astNodeType + " " + "catch ";
} else if ("SwitchCase".equals(astNodeType)) {
tokens += astNodeType + " case ";
} else if ("SuperConstructorInvocation".equals(astNodeType)) {
tokens += astNodeType + " super ";
} else if ("ConstructorInvocation".equals(astNodeType)) {
tokens += astNodeType + " this ";
} else if ("FinallyBody".equals(astNodeType)) {
tokens += astNodeType + " finally ";
} else if ("LabeledStatement".equals(astNodeType)) {
tokens += "LabeledStatement " + simpleTree.getLabel();
}
if ("ArrayInitializer".equals(astNodeType)) {
tokens += astNodeType + " arrayInitializer ";
} else {
@@ -1,18 +1,14 @@
package edu.lu.uni.serval.config;
public class Configuration {
private static final String ROOT_PATH = "../";
private static final String ROOT_PATH = "../"; // The root path of all output data.
public static final int HUNK_SIZE = 7;
public static final String BUGGY_TREE_TOKEN = "BUGGY_TREE###";
public static final String PATCH_TOKEN = "PATCH###";
public static int MAXZ_SIZE = 0;
public static int TOKEN_VECTOR_SIZE = 0;
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN = 100;
public static final int HUNK_SIZE = 7; // The limitation of source code lines of each DiffEntry, which will be selected as training data.
public static final String BUGGY_TREE_SIGNAL = "BUGGY_TREE###"; // The starting signal of the tree of buggy source code.
public static final String PATCH_SIGNAL = "PATCH###"; // The starting signal of each patch.
// input path of GumTree. (i.e., Fix patterns parser)
public static final String GUM_TREE_INPUT = ROOT_PATH + "GumTreeInput/";
public static final String GUM_TREE_INPUT = ROOT_PATH + "GumTreeInput/";// Buggy version file VS. Fixing version file, (DiffEntry File)
// the output path of GumTree results.
private static final String GUM_TREE_OUTPUT = ROOT_PATH + "GumTreeResults/";
@@ -28,28 +24,49 @@ public class Configuration {
public static final String BUGGY_CODY_TOKENS_FILE = GUM_TREE_OUTPUT + "tokens.list";
public static final String EDITSCRIPT_SIZES_FILE = GUM_TREE_OUTPUT + "editScriptSizes.list";
public static int MAX_EDIT_SCRIPT_VECTOR_SIZE = 0; // The max size of edit script vectors.
public static int MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE = 0; // The max size of all buggy source code token vectors.
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN1 = 100; // tokens of edit scripts.
public static final int VECTOR_SIZE_OF_EMBEDED_TOKEN2 = 200; // tokens of source code
// the input path of fix patterns mining.
private static final String MINING_INPUT = ROOT_PATH + "MiningInput/";
// the input path of token embedding.
public static final String EMBEDDING_INPUT = MINING_INPUT + "Embedding/";
public static final String SELECTED_PATCHES_SOURE_CODE_FILE = EMBEDDING_INPUT + "patchSourceCode.list";
public static final String SELECTED_PATCHES_SOURE_CODE_FILE = EMBEDDING_INPUT + "patchSourceCode.list";// Selected patches.
public static final String SELECTED_BUGGY_TREE_FILE = EMBEDDING_INPUT + "buggyTrees.list";
public static final String SELECTED_BUGGY_TOKEN_FILE = EMBEDDING_INPUT + "tokens.list";
public static final String SELECTED_EDITSCRIPTES_FILE = EMBEDDING_INPUT + "editScripts.list";
public static final String SELECTED_BUGGY_TOKEN_FILE = EMBEDDING_INPUT + "tokens.list"; // Selected token vectors of buggy source code.
public static final String SELECTED_EDITSCRIPTES_FILE = EMBEDDING_INPUT + "editScripts.list"; // Selected edit script vectors.
// the input path of feature learning.
public static final String FEATURE_LEARNING_INPUT = MINING_INPUT + "FeatureLearning/";
public static final String EMBEDDED_EDIT_SCRIPT_TOKENS = FEATURE_LEARNING_INPUT + "embeddedEditScriptTokens.list";
public static final String VECTORIED_EDIT_SCRIPTS = FEATURE_LEARNING_INPUT + "vectorizedEditScripts.csv";
public static final String EMBEDDED_EDIT_SCRIPT_TOKENS = FEATURE_LEARNING_INPUT + "embeddedEditScriptTokens.list"; // All embedded tokens of selected edit scripts.
public static final String VECTORIED_EDIT_SCRIPTS = FEATURE_LEARNING_INPUT + "vectorizedEditScripts.csv"; // Embedded and vectorized edit script vectors.
// the input path of clustering.
public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/";
public static final String EXTRACTED_FEATURES = MINING_INPUT + "ExtractedFeatures/"; // Extracted features of edit scripts.
public static final String CLUSTER_INPUT = MINING_INPUT + "ClusteringInput/input.arff";
// the output path of fix patterns mining.
private static final String MINING_OUTPUT = ROOT_PATH + "MiningOutput/";
public static final String CLUSTER_OUTPUT = MINING_OUTPUT + "ClusteringOutput/clusterResults.list";
public static final String CLUSTERED_PATCHES_FILE = MINING_OUTPUT + "ClusteredPatches/";
public static final String CLUSTERED_TOKENSS_FILE = MINING_OUTPUT + "ClusteredTokens/"; // Token vectors of buggy source code.
// evaluation data
public static final String TEST_INPUT = ROOT_PATH + "TestProjects/";
public static final String TEST_LOCALIZATION_FILE = ROOT_PATH + "TestData/Localization.list";
public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements.list";
public static final String TEST_LOCALIZATION_FILE = ROOT_PATH + "TestData/Localization.list"; // Positions of all test statements.
public static final String TEST_DATA_FILE = ROOT_PATH + "TestData/TestStatements.list"; // Token vectors of all test statements.
// data of unsupervised learning
public static final String EMBEDDING_DATA_TOKENS1 = ROOT_PATH + "TestData/AllTokenVectorsForEvaluation.list";
public static final String EMBEDDED_ALL_TOKENS1 = ROOT_PATH + "TestData/AllEmbeddedTokens.list";
public static final String VECTORIED_ALL_SOURCE_CODE1 = ROOT_PATH + "TestData/AllVectorizedSourceCode.list";
public static final String EXTRACTED_FEATURES_TESTING = ROOT_PATH + "TestDataExtractedFeatures/";
// Data of supervised learning
public static final String CLUSTERNUMBER_LABEL_MAP = ROOT_PATH + "TestData/clusterMappingLabel.list";
public static final String EMBEDDING_DATA_TOKENS2 = ROOT_PATH + "TestData/AllTokenVectorsForSupervisedEvaluation.list";
public static final String EMBEDDED_ALL_TOKENS2 = ROOT_PATH + "TestData/AllEmbeddedTokensForSuperVisedEvaluation.list";
public static final String TRAINING_DATA = ROOT_PATH + "TestData/TrainingData.csv"; // Training data of supervised learning
public static final String TESTING_DATA = ROOT_PATH + "TestData/TestingData.csv"; // Testing data of supervised learning
}
@@ -38,15 +38,15 @@ public class ProjectScanner {
for (File project : projects) {
ProjectScanner scanner = new ProjectScanner();
scanner.scanJavaProject(project.getPath(), outputLocalizeFile, outputTokensFile);
scanner.scanJavaProject(project, outputLocalizeFile, outputTokensFile);
}
}
List<SimpleTree> allSimpleTrees = new ArrayList<>();
public void scanJavaProject(String javaProject, String outputLocalizeFile, String outputTokensFile) {
public void scanJavaProject(File javaProject, String outputLocalizeFile, String outputTokensFile) {
List<File> files = new ArrayList<>();
files.addAll(FileHelper.getAllFiles(javaProject, ".java"));
files.addAll(FileHelper.getAllFiles(javaProject.getPath(), ".java"));
StringBuilder tokensBuilder = new StringBuilder();
StringBuilder localizationsBuilder = new StringBuilder();
@@ -59,7 +59,7 @@ public class ProjectScanner {
CUCreator cuCreator = new CUCreator();
CompilationUnit cUnit = cuCreator.createCompilationUnit(file);
getTokenVectorOfAllStatements(tree, cUnit, tokensBuilder, localizationsBuilder, javaProject, file.getPath());
getTokenVectorOfAllStatements(tree, cUnit, tokensBuilder, localizationsBuilder, javaProject.getPath(), file.getPath());
if (++ counter % 1000 == 0) {
FileHelper.outputToFile(outputLocalizeFile, localizationsBuilder, true);
@@ -75,7 +75,7 @@ public class ProjectScanner {
tokensBuilder.setLength(0);
}
public void getTokenVectorOfAllStatements(ITree tree, CompilationUnit unit, StringBuilder tokensBuilder, StringBuilder localizationsBuilder, String projectName, String filePath) {
private void getTokenVectorOfAllStatements(ITree tree, CompilationUnit unit, StringBuilder tokensBuilder, StringBuilder localizationsBuilder, String projectName, String filePath) {
String astNodeType = ASTNodeMap.map.get(tree.getType()); //ignore: SwitchCase, SuperConstructorInvocation, ConstructorInvocation
if ((astNodeType.endsWith("Statement") && !astNodeType.equals("TypeDeclarationStatement"))
|| astNodeType.equals("FieldDeclaration")) {
@@ -98,7 +98,7 @@ public class ProjectScanner {
// project name: file name: line number
String tokens = Tokenizer.getTokensDeepFirst(simpleTree).trim();
String[] tokensArray = tokens.split(" ");
if (tokensArray.length <= Configuration.TOKEN_VECTOR_SIZE) {
if (tokensArray.length <= Configuration.MAX_SOURCE_CODE_TOKEN_VECTOR_SIZE) {
int position = tree.getPos();
int lineNum = unit.getLineNumber(position);
tokensBuilder.append(tokens).append("\n");