import html
import logging
import os
import re
import sys
from string import punctuation

from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# from stop_words import get_stop_words

from common.commons import *

CODE_PATH = os.environ["CODE_PATH"]

# English stop words removed in preprocessingNL().
en_stop = stopwords.words('english')

# Optional spaCy-based vocabulary filtering (currently disabled):
# import spacy
# nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner'])
# nlp.max_length = 100000000

def getTokensForPatterns(res):
    # Flatten a (possibly nested) list of strings into one string.
    if isinstance(res, list):
        merged = str()
        for r in res:
            if isinstance(r, list):
                merged = merged + ' ' + ' '.join(r)
            else:
                merged = merged + ' ' + r
    else:
        merged = res

    res = html.unescape(merged)

    tokens = getTokens(res, False)

    # Split each whitespace token on code punctuation (dots, brackets,
    # operators, ...) and drop the empty fragments.
    stripped = []
    for t in tokens:
        splits = re.split(r"[.():<>=/\\'\-,\]\[}{;]", t)
        for s in splits:
            stripped.append(s)
    non_empty = [i for i in stripped if i != '']
    return non_empty

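# Illustrative sketch (not part of the original module): what
# getTokensForPatterns() produces for a small code fragment. The input
# string is made up for demonstration.
def _example_getTokensForPatterns():
    tokens = getTokensForPatterns("map.get(key) != null")
    # The punctuation-separated fragments become individual tokens,
    # e.g. something like ['map', 'get', 'key', '!', 'null'].
    print(tokens)
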
def preprocessingCodeElementsList(res):
    printDetail = False
    # Flatten a (possibly nested) list of strings into one string.
    if isinstance(res, list):
        merged = str()
        for r in res:
            if isinstance(r, list):
                merged = merged + ' ' + ' '.join(r)
            else:
                merged = merged + ' ' + r
    else:
        merged = res

    res = html.unescape(merged)

    tokens = getTokens(res, printDetail)

    # Split each whitespace token on code punctuation.
    stripped = []
    for t in tokens:
        splits = re.split(r"[.():<>=/\\'\-,\]\[]", t)
        for s in splits:
            stripped.append(s)
    punc = removeEndingPunct(stripped, printDetail)

    non_empty = [i for i in punc if i != '']

    stripped = removeEndingPunct(non_empty, printDetail)

    # Split camelCase and snake_case identifiers, keeping each original
    # identifier alongside its parts (keepOriginal=True).
    camelCase = handleCamelCase(stripped, printDetail, True)
    underScore = handleUnderScore(camelCase, printDetail, True)

    lower = [i.lower() for i in underScore]

    # stopped_tokens = [i for i in lower if not i in en_stop]

    stem2 = stem(lower, printDetail)
    if printDetail:
        print('=====CLEANED=========')
        print(stem2)

    return stem2

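# Illustrative sketch (not part of the original module): the code-element
# pipeline splits identifiers, keeps the originals, lowercases and stems.
# The input identifier is made up for demonstration.
def _example_preprocessingCodeElementsList():
    out = preprocessingCodeElementsList(["HashMap.getOrDefault(defaultValue)"])
    # Expect the camelCase parts plus the lowercased original identifiers,
    # all Porter-stemmed, e.g. something like
    # ['hash', 'map', 'hashmap', 'get', 'or', 'default', 'getordefault', ...].
    print(out)
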
def preprocessingForSimi(res):
    printDetail = False
    # Flatten a (possibly nested) list of strings into one string.
    if isinstance(res, list):
        merged = str()
        for r in res:
            if isinstance(r, list):
                merged = merged + ' ' + ' '.join(r)
            else:
                merged = merged + ' ' + r
    else:
        merged = res

    res = html.unescape(merged)

    tokens = getTokens(res, printDetail)

    # Split each whitespace token on code punctuation.
    stripped = []
    for t in tokens:
        splits = re.split(r"[.():<>=/\\'\-,\]\[]", t)
        for s in splits:
            stripped.append(s)
    punc = removeEndingPunct(stripped, printDetail)

    non_empty = [i for i in punc if i != '']

    stripped = removeEndingPunct(non_empty, printDetail)

    # Unlike preprocessingCodeElementsList(), the original identifiers are
    # dropped (keepOriginal=False) and no lowercasing or stemming is applied.
    camelCase = handleCamelCase(stripped, printDetail, False)
    underScore = handleUnderScore(camelCase, printDetail, False)

    # lower = [i.lower() for i in underScore]
    # stopped_tokens = [i for i in lower if not i in en_stop]
    # stem2 = stem(lower, printDetail)
    # if printDetail:
    #     print('=====CLEANED=========')
    #     print(stem2)

    return underScore

def preprocessingNL(res):
    try:
        printDetail = False

        # Flatten a (possibly nested) list of strings into one string.
        if isinstance(res, list):
            merged = str()
            for r in res:
                if isinstance(r, list):
                    merged = merged + ' ' + ' '.join(r)
                else:
                    merged = merged + ' ' + r
        else:
            merged = res

        res = html.unescape(merged)
        # Decode HTML entities that may have survived unescaping, then
        # strip URLs.
        html_decoded_string = (res.replace("&amp;", "&").replace("&quot;", '"')
                               .replace("&apos;", "'").replace("&gt;", ">")
                               .replace("&lt;", "<"))
        html_decoded_string = re.sub(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            '', html_decoded_string)

        tokens = getTokens(html_decoded_string, printDetail)

        # Split each whitespace token on code punctuation.
        stripped = []
        for t in tokens:
            splits = re.split(r"[.():<>=/\\'\-,\]\[]", t)
            for s in splits:
                stripped.append(s)
        punc = removeEndingPunct(stripped, printDetail)

        non_empty = [i for i in punc if i != '']

        stripped = removeEndingPunct(non_empty, printDetail)

        camelCase = handleCamelCase(stripped, printDetail, True)
        underScore = handleUnderScore(camelCase, printDetail, True)

        lower = [i.lower() for i in underScore]

        stopped_tokens = [i for i in lower if i not in en_stop]

        nonDigit = [i for i in stopped_tokens if not i.isdigit()]

        # Optional spaCy vocabulary filtering (currently disabled):
        # doc = nlp(' '.join(nonDigit))
        # newWord = []
        # for token in doc:
        #     if token.text in nlp.vocab:
        #         newWord.append(token.text)

        stem2 = stem(nonDigit, printDetail)

        if printDetail:
            print('=====CLEANED=========')
            print(stem2)

        return stem2
    except Exception as e:
        logging.error(e)

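# Illustrative sketch (not part of the original module): the NL pipeline on
# a made-up bug-report sentence. URLs, stop words and digits are dropped;
# camelCase terms are split, kept, lowercased and stemmed.
def _example_preprocessingNL():
    out = preprocessingNL(
        "NullPointerException at line 42, see https://example.com/issue")
    # Expect roughly ['null', 'pointer', 'except', 'nullpointerexcept',
    # 'line', 'see'].
    print(out)
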
def getTokens(text, printDetail=False):
    # Tokenize on whitespace only; punctuation is handled by the callers.
    # (The parameter was renamed from `re`, which shadowed the re module.)
    tokenizer = RegexpTokenizer(r'\S+')
    tokens = tokenizer.tokenize(text)
    if printDetail:
        print('=====TOKENS=========')
        print(tokens)

    return tokens

def charLength(x, l=3):
    # True for purely alphabetic tokens of at least l characters.
    return x.isalpha() and len(x) >= l

def removeEndingPunct(tokens, printDetail):
    # Strip leading and trailing punctuation from every token.
    stripped = [i.strip(punctuation) for i in tokens]
    if printDetail:
        print('=====removeEndingPunct=========')
        print(stripped)
    return stripped

def handleCamelCase(tokens, printDetail=False, keepOriginal=False):
    # Split camelCase tokens into their parts; optionally also keep the
    # original token.
    camelCased = list()

    for i in tokens:
        listOfCC = camel_case_split(i)
        camelCased.extend(listOfCC)
        if i not in listOfCC and keepOriginal:
            camelCased.append(i)

    if printDetail:
        print('=====CAMEL CASE=========')
        print(camelCased)
    return camelCased

def handleUnderScore(tokens, printDetail=False, keepOriginal=False):
    # Split snake_case tokens on underscores; optionally also keep the
    # original token.
    underScored = list()
    for i in tokens:
        listOfCC = i.split('_')
        underScored.extend(listOfCC)
        if i not in listOfCC and keepOriginal:
            underScored.append(i)

    if printDetail:
        print('=====UNDER SCORE=========')
        print(underScored)

    return underScored

def camel_case_split(identifier):
    # Split at lower-to-upper transitions while keeping acronym runs
    # together (an acronym ends where an [A-Z][a-z] pair begins).
    matches = re.finditer(
        r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    res = [m.group(0) for m in matches]

    return res

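# Illustrative sketch (not part of the original module): the regex keeps
# acronyms together while splitting ordinary camelCase.
def _example_camel_case_split():
    print(camel_case_split('parseHTTPResponse'))    # ['parse', 'HTTP', 'Response']
    print(camel_case_split('NullPointerException'))  # ['Null', 'Pointer', 'Exception']
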
def stem(res, printDetail):
    # Porter-stem every non-empty token.
    p_stemmer = PorterStemmer()
    stemmed_tokens = [p_stemmer.stem(i.strip()) for i in res if i]
    if printDetail:
        print('=====STEMMED=========')
        print(stemmed_tokens)
    return stemmed_tokens

def isEnglish(word_to_test):
    # Return the word if WordNet knows it, otherwise None.
    if not wordnet.synsets(word_to_test):
        # Not an English word.
        # TODO: decide how non-English words should be handled.
        return None
    else:
        return word_to_test

def dummy_fun(doc):
    # Identity tokenizer: the corpus is already tokenized, so the
    # vectorizer should use each document's token list as-is.
    return doc

def calculateTfIdfCodeElementsList(aCorpus):
    global progress
    progress = 0
    # The corpus is pre-tokenized, so tokenization and lowercasing are
    # disabled; sublinear_tf applies 1 + log(tf) term-frequency scaling.
    v = TfidfVectorizer(tokenizer=dummy_fun, stop_words=None, lowercase=False,
                        sublinear_tf=True)  # alternatively: max_df=0.7, min_df=3
    v.fit(aCorpus)
    return v


def calculateTfIdfNLList(aCorpus):
    global progress
    progress = 0
    v = TfidfVectorizer(tokenizer=dummy_fun, stop_words=None, lowercase=False,
                        sublinear_tf=True)  # alternatively: max_df=0.7, min_df=3
    v.fit(aCorpus)
    return v

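# Illustrative sketch (not part of the original module): fit the vectorizer
# on a pre-tokenized corpus and rank documents by cosine similarity. The
# three report texts are made up for demonstration.
def _example_tfidf_similarity():
    corpus = [
        preprocessingNL("NullPointerException thrown when parsing the config"),
        preprocessingNL("Parser crashes with a null pointer on a malformed config"),
        preprocessingNL("Add dark mode support to the settings page"),
    ]
    v = calculateTfIdfNLList(corpus)
    dtm = v.transform(corpus)
    # The first two reports share terms, so their similarity should be
    # noticeably higher than either one's similarity to the third.
    print(cosine_similarity(dtm[0], dtm[1:]))
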
def getDTMNL(x, v, corpus):
    # Transform one pre-tokenized document into its TF-IDF vector; x is a
    # pandas row whose .name is the document's index in the corpus.
    ind = x.name
    v.tokenizer = dummy_fun
    return v.transform([corpus[ind]])


def getDTMCE(x, v, corpus):
    ind = x.name
    v.tokenizer = dummy_fun
    return v.transform([corpus[ind]])


def getBRDTM(x, v, corpus):
    ind = x.name
    v.tokenizer = dummy_fun
    return v.transform([corpus[ind]])


def getBRDTMCEs(x, v, corpus):
    ind = x.name
    v.tokenizer = dummy_fun
    return v.transform([corpus[ind]])
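

# Illustrative sketch (not part of the original module): the getDTM*/getBRDTM*
# helpers rely on a pandas row's .name matching the document's position in
# the pre-tokenized corpus. The DataFrame and texts are made up.
def _example_dtm_usage():
    import pandas as pd
    corpus = [preprocessingNL("NullPointerException in the parser"),
              preprocessingNL("Dark mode for the settings page")]
    df = pd.DataFrame({'report': ['NPE bug', 'UI feature']})
    v = calculateTfIdfNLList(corpus)
    row = df.iloc[0]            # row.name == 0, so corpus[0] is vectorized
    vec = getDTMNL(row, v, corpus)
    print(vec)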