Files
fixminer_source/python/tokens.py
T
2020-04-06 21:30:39 +02:00

138 lines
6.2 KiB
Python

from common.commons import *
# Root directory of the data set, read once when this module is imported.
# The DATA_PATH environment variable must therefore be set beforehand; a
# missing variable raises KeyError right here.
# NOTE(review): `os` is not imported explicitly in this file -- presumably it
# is re-exported by `from common.commons import *`; confirm.
DATA_PATH = os.environ["DATA_PATH"]
# Node-type labels that may follow the action keyword in an edit-action line
# (e.g. "UPD MethodInvocation@@...").
# NOTE(review): the names look like Eclipse JDT AST node types plus a few
# extra labels (MethodName, Operator, New, Instanceof) -- confirm the origin.
ast = [
    "AnonymousClassDeclaration", "ArrayAccess", "ArrayCreation",
    "ArrayInitializer", "ArrayType", "AssertStatement", "Assignment",
    "Block", "BooleanLiteral", "BreakStatement", "CastExpression",
    "CatchClause", "CharacterLiteral", "ClassInstanceCreation",
    "CompilationUnit", "ConditionalExpression", "ConstructorInvocation",
    "ContinueStatement", "DoStatement", "EmptyStatement",
    "ExpressionStatement", "FieldAccess", "FieldDeclaration",
    "ForStatement", "IfStatement", "ImportDeclaration", "InfixExpression",
    "Initializer", "Javadoc", "LabeledStatement", "MethodDeclaration",
    "MethodInvocation", "NullLiteral", "NumberLiteral",
    "PackageDeclaration", "ParenthesizedExpression", "PostfixExpression",
    "PrefixExpression", "PrimitiveType", "QualifiedName",
    "ReturnStatement", "SimpleName", "SimpleType",
    "SingleVariableDeclaration", "StringLiteral",
    "SuperConstructorInvocation", "SuperFieldAccess",
    "SuperMethodInvocation", "SwitchCase", "SwitchStatement",
    "SynchronizedStatement", "ThisExpression", "ThrowStatement",
    "TryStatement", "TypeDeclaration", "TypeDeclarationStatement",
    "TypeLiteral", "VariableDeclarationExpression",
    "VariableDeclarationFragment", "VariableDeclarationStatement",
    "WhileStatement", "InstanceofExpression", "LineComment",
    "BlockComment", "TagElement", "TextElement", "MemberRef", "MethodRef",
    "MethodRefParameter", "EnhancedForStatement", "EnumDeclaration",
    "EnumConstantDeclaration", "TypeParameter", "ParameterizedType",
    "QualifiedType", "WildcardType", "NormalAnnotation",
    "MarkerAnnotation", "SingleMemberAnnotation", "MemberValuePair",
    "AnnotationTypeDeclaration", "AnnotationTypeMemberDeclaration",
    "Modifier", "UnionType", "Dimension", "LambdaExpression",
    "IntersectionType", "NameQualifiedType", "CreationReference",
    "ExpressionMethodReference", "SuperMethodReference",
    "TypeMethodReference", "MethodName", "Operator", "New", "Instanceof",
]

# Capturing group that matches any one of the labels above; built once and
# shared by the four action patterns.
_NODE_GROUP = '(' + '|'.join(ast) + ')'

# Regex sources for the four edit actions.  Group 1 is always the node label
# of the acted-on element and group 2 the text that follows its "@@".
# MOV/INS additionally capture the target label (group 3) and its text
# (group 4); UPD captures the replacement text as group 3.
movPattern = 'MOV ' + _NODE_GROUP + '@@(.*)@TO@ ' + _NODE_GROUP + '@@(.*)@AT@'
delPattern = 'DEL ' + _NODE_GROUP + '@@(.*)@AT@'
insPattern = 'INS ' + _NODE_GROUP + '@@(.*)@TO@ ' + _NODE_GROUP + '@@(.*)@AT@'
updPattern = 'UPD ' + _NODE_GROUP + '@@(.*)@TO@(.*)@AT@'
import redis
# Port of the local redis server that backs the three working databases.
port = '6380'


def _local_redis(db_index, redis_port=port):
    """Return a StrictRedis client for database ``db_index`` on localhost."""
    return redis.StrictRedis(host="localhost", port=redis_port, db=db_index)


# db 0: pair keys; simiCore deletes a key from here when it is done with it.
redis_db = _local_redis(0)
# db 1: maps "<prefix>-<index>" to an action file name (read by getTokens).
redis_db1 = _local_redis(1)
# db 2: receives the similarity score of pairs that pass the threshold.
redis_db2 = _local_redis(2)
# Separate server on port 6399; not used by the functions visible in this file.
redis_out = _local_redis(0, redis_port=6399)
def getTokens(prefix, i):
    """Return the preprocessed code tokens of one stored edit script.

    The file name is looked up in ``redis_db1`` under ``prefix + "-" + i``
    and opened below ``DATA_PATH/actions/<prefix with '-' replaced by '/'>``.
    The file content is split on its ``@LENGTH@ <n>`` markers; every
    resulting chunk is one edit action, for example::

        MOV TryStatement@@try:[] @TO@ MethodDeclaration@@public, T, T, MethodName:lookupByNameAndType, String name, Class<T> type, @AT@ 2164
        DEL VariableDeclarationStatement@@Path hfilePath=HFileLink.getHFileFromBackReference(getConf(),filePath); @AT@ 2474
        INS ThrowStatement@@MethodInvocation:convertJedisAccessException(ex) @TO@ CatchClause@@Exception ex @AT@ 12194
        UPD MethodInvocation@@getVectorExpression(elseDesc,mode) @TO@ getVectorExpression(elseDesc,VectorExpressionDescriptor.Mode.PROJECTION) @AT@ 136925

    For INS, DEL and MOV capture group 2 of the matching pattern is
    collected; for UPD groups 2 and 3 (old and new text) are.  The label
    prefixes ``MethodInvocation:``, ``Name:``, ``MethodName:``,
    ``SimpleName:`` and ``InfixExpression:`` are replaced by a space.

    Args:
        prefix: first part of the redis key; also names the sub-directory.
        i: second part of the redis key (a string, it is concatenated).

    Returns:
        The result of ``preprocessingForSimi`` applied to the collected
        strings, or ``None`` as soon as an INS/UPD/DEL/MOV chunk does not
        match the pattern for its action.

    Raises:
        AttributeError: if ``redis_db1`` holds no entry for the key
            (``get`` then yields ``None``, which has no ``decode``).
    """
    # Imported before the loop: the original imported it inside the loop
    # body, so a file without any non-empty chunk hit an UnboundLocalError
    # on the final call below.
    from common.preprocessing import preprocessingForSimi

    # action keyword -> (regex source, capture groups that hold code text)
    action_specs = (
        ('INS', insPattern, (2,)),
        ('UPD', updPattern, (2, 3)),
        ('DEL', delPattern, (2,)),
        ('MOV', movPattern, (2,)),
    )

    dist2load = redis_db1.get(prefix + "-" + i)
    action_file = join(DATA_PATH, 'actions', prefix.replace('-', '/'), dist2load.decode())
    with open(action_file, 'r') as rFile:
        content = rFile.read()

    tokens = []
    for line in re.split(r"@LENGTH@ \d+", content):
        # Drop surrounding whitespace, then any leading/trailing '-' characters.
        line = line.strip().strip('-')
        if not line:
            continue

        for keyword, pattern, groups in action_specs:
            if line.startswith(keyword):
                break
        else:
            # Chunks that start with none of the four keywords contribute
            # nothing (the original matched them against an empty pattern
            # and extracted no groups).
            continue

        m = re.search(pattern, line, re.DOTALL)
        if m is None:
            return None
        for k in groups:
            token = re.sub('MethodInvocation:|Name:|MethodName:|SimpleName:|InfixExpression:', ' ',
                           m.group(k))
            tokens.append(token)

    return preprocessingForSimi(tokens)
# for key in keys:
def simiCore(key):
    """Score the token similarity of the pair named by ``key`` and record it.

    ``key`` is split on ``'_'`` into ``prefix``, ``i`` and ``j``.  The tokens
    of both members are fetched with ``getTokens`` and, after removing
    duplicates (order preserved via ``unique_everseen``), compared with
    ``textdistance.sorensen_dice``.  A score of at least 0.8 is printed and
    stored in ``redis_db2`` under ``key``.  Finally ``key`` is deleted from
    ``redis_db``.

    If ``getTokens`` returns ``None`` for either member the pair cannot be
    compared; it is treated like a pair below the threshold instead of
    crashing with ``TypeError`` inside ``unique_everseen``.

    Args:
        key: string of the form ``<prefix>_<i>_<j>``.

    Raises:
        IndexError: if ``key`` contains fewer than two ``'_'`` separators.
    """
    import textdistance

    parts = key.split('_')
    prefix, i, j = parts[0], parts[1], parts[2]

    tokensi = getTokens(prefix, i)
    tokensj = getTokens(prefix, j)

    if tokensi is not None and tokensj is not None:
        simi2 = textdistance.sorensen_dice(list(unique_everseen(tokensi)),
                                           list(unique_everseen(tokensj)))
        if simi2 >= 0.8:
            print(key, simi2)
            redis_db2.set(key, simi2)

    # NOTE(review): the source this was recovered from had lost its
    # indentation; the delete is assumed to run for every processed key,
    # not only for pairs above the threshold -- confirm.
    redis_db.delete(key)