merge python scripts
This commit is contained in:
@@ -0,0 +1,300 @@
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
# from stop_words import get_stop_words
|
||||
from nltk.stem.porter import PorterStemmer
|
||||
from string import punctuation
|
||||
import re
|
||||
from nltk.corpus import stopwords
|
||||
en_stop = stopwords.words('english')
|
||||
from nltk.corpus import wordnet
|
||||
import html
|
||||
|
||||
from common.commons import *
|
||||
CODE_PATH = os.environ["CODE_PATH"]
|
||||
|
||||
# import spacy
|
||||
# nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner'])
|
||||
# nlp.max_length =100000000
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
def getTokensForPatterns(res):
    """Split raw pattern text into delimiter-free, non-empty tokens.

    Args:
        res: Either a single string, or a list whose items are strings or
            lists of strings. List input is flattened into one
            space-separated string before tokenising.

    Returns:
        list[str]: The whitespace-separated tokens of the HTML-unescaped
        text, each further split on the delimiter characters
        (dot, parentheses, colon, angle brackets, equals, slash, backslash,
        single quote, hyphen, comma, square brackets, braces, semicolon),
        with empty fragments removed. Case is left untouched.
    """
    if isinstance(res, list):
        # Every item is prefixed with a single space; nested lists are
        # space-joined first. join() avoids repeated string concatenation.
        merged = ''.join(
            ' ' + (' '.join(r) if isinstance(r, list) else r) for r in res
        )
    else:
        merged = res

    text = html.unescape(merged)

    # Raw string: same pattern value as before, but without the invalid
    # escape sequences that a plain string literal warns about.
    delimiters = re.compile(r"\.|\(|\)|:|>|<|:|=|/|\\|'|-|,|\]|\[|}|{|;")

    return [
        piece
        for token in getTokens(text, False)
        for piece in delimiters.split(token)
        if piece != ''
    ]
|
||||
|
||||
def preprocessingCodeElementsList(res):
    """Normalise code-element text into lower-cased, stemmed tokens.

    Pipeline: flatten -> HTML-unescape -> whitespace tokenise -> split on
    delimiter characters -> strip surrounding punctuation -> drop empties ->
    split camelCase and snake_case (keeping the compound token as well) ->
    lower-case -> stem. Stop words and digits are NOT removed here.

    Args:
        res: Either a single string, or a list whose items are strings or
            lists of strings.

    Returns:
        list[str]: The stemmed tokens, as returned by ``stem``.
    """
    printDetail = False  # flip to True to trace every pipeline stage

    if isinstance(res, list):
        # Every item is prefixed with a single space; nested lists are
        # space-joined first. join() avoids repeated string concatenation.
        merged = ''.join(
            ' ' + (' '.join(r) if isinstance(r, list) else r) for r in res
        )
    else:
        merged = res

    text = html.unescape(merged)
    tokens = getTokens(text, printDetail)

    # Raw string: same pattern value as before, but without the invalid
    # escape sequences that a plain string literal warns about.
    delimiters = re.compile(r"\.|\(|\)|:|>|<|:|=|/|\\|'|-|,|\]|\[")
    stripped = [piece for t in tokens for piece in delimiters.split(t)]

    punc = removeEndingPunct(stripped, printDetail)
    non_empty = [i for i in punc if i != '']
    # Second stripping pass kept so the pipeline stays exactly as before.
    stripped = removeEndingPunct(non_empty, printDetail)

    # keepOriginal=True: the unsplit identifier is retained next to its parts.
    camelCase = handleCamelCase(stripped, printDetail, True)
    underScore = handleUnderScore(camelCase, printDetail, True)

    lower = [i.lower() for i in underScore]

    stem2 = stem(lower, printDetail)
    if printDetail:
        print('=====CLEANED=========')
        print(stem2)

    return stem2
|
||||
|
||||
def preprocessingForSimi(res):
    """Tokenise text for similarity comparison without lower-casing or stemming.

    Pipeline: flatten -> HTML-unescape -> whitespace tokenise -> split on
    delimiter characters -> strip surrounding punctuation -> drop empties ->
    split camelCase and snake_case (the compound token is NOT kept).

    Args:
        res: Either a single string, or a list whose items are strings or
            lists of strings.

    Returns:
        list[str]: The resulting tokens in their original letter case. The
        underscore split can yield empty strings (e.g. for a leading
        underscore); they are not filtered out.
    """
    printDetail = False  # flip to True to trace every pipeline stage

    if isinstance(res, list):
        # Every item is prefixed with a single space; nested lists are
        # space-joined first. join() avoids repeated string concatenation.
        merged = ''.join(
            ' ' + (' '.join(r) if isinstance(r, list) else r) for r in res
        )
    else:
        merged = res

    text = html.unescape(merged)
    tokens = getTokens(text, printDetail)

    # Raw string: same pattern value as before, but without the invalid
    # escape sequences that a plain string literal warns about.
    delimiters = re.compile(r"\.|\(|\)|:|>|<|:|=|/|\\|'|-|,|\]|\[")
    stripped = [piece for t in tokens for piece in delimiters.split(t)]

    punc = removeEndingPunct(stripped, printDetail)
    non_empty = [i for i in punc if i != '']
    # Second stripping pass kept so the pipeline stays exactly as before.
    stripped = removeEndingPunct(non_empty, printDetail)

    # keepOriginal=False: only the split parts survive.
    camelCase = handleCamelCase(stripped, printDetail, False)
    underScore = handleUnderScore(camelCase, printDetail, False)

    return underScore
|
||||
|
||||
def preprocessingNL(res):
    """Normalise natural-language text into stemmed, stop-word-free tokens.

    Pipeline: flatten -> HTML-unescape (plus a second pass over the five
    basic entities) -> remove http/https URLs -> whitespace tokenise ->
    split on delimiter characters -> strip surrounding punctuation -> drop
    empties -> split camelCase and snake_case (keeping the compound token)
    -> lower-case -> drop English stop words and all-digit tokens -> stem.

    Args:
        res: Either a single string, or a list whose items are strings or
            lists of strings.

    Returns:
        list[str] | None: The stemmed tokens. If any step raises, the error
        is logged via ``logging.error`` and ``None`` is returned.
    """
    try:
        printDetail = False  # flip to True to trace every pipeline stage

        if isinstance(res, list):
            # Every item is prefixed with a single space; nested lists are
            # space-joined first. join() avoids repeated concatenation.
            merged = ''.join(
                ' ' + (' '.join(r) if isinstance(r, list) else r) for r in res
            )
        else:
            merged = res

        html_decoded_string = html.unescape(merged)

        # Second decoding pass for text that was entity-encoded twice.
        # NOTE(review): the entity names were reconstructed from the five
        # replacement characters (& " ' > <) -- confirm against the
        # repository. They are assembled from parts so that this source
        # survives being rendered through an HTML page unchanged.
        # "amp" must stay first so that nested encodings cascade.
        for entity_name, char in (
            ("amp", "&"),
            ("quot", '"'),
            ("apos", "'"),
            ("gt", ">"),
            ("lt", "<"),
        ):
            html_decoded_string = html_decoded_string.replace(
                "&" + entity_name + ";", char
            )

        html_decoded_string = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', html_decoded_string)

        tokens = getTokens(html_decoded_string, printDetail)

        # Raw string: same pattern value as before, but without the invalid
        # escape sequences that a plain string literal warns about.
        delimiters = re.compile(r"\.|\(|\)|:|>|<|:|=|/|\\|'|-|,|\]|\[")
        stripped = [piece for t in tokens for piece in delimiters.split(t)]

        punc = removeEndingPunct(stripped, printDetail)
        non_empty = [i for i in punc if i != '']
        # Second stripping pass kept so the pipeline stays exactly as before.
        stripped = removeEndingPunct(non_empty, printDetail)

        # keepOriginal=True: the unsplit identifier is retained too.
        camelCase = handleCamelCase(stripped, printDetail, True)
        underScore = handleUnderScore(camelCase, printDetail, True)

        lower = [i.lower() for i in underScore]

        # A set gives constant-time membership tests; en_stop is a list.
        stop_words = set(en_stop)
        nonDigit = [
            i for i in lower if i not in stop_words and not i.isdigit()
        ]

        stem2 = stem(nonDigit, printDetail)

        if printDetail:
            print('=====CLEANED=========')
            print(stem2)

        return stem2
    except Exception as e:
        # Best-effort by design: callers receive None for unparseable input.
        logging.error(e)
        return None
|
||||
|
||||
def getTokens(re, printDetail=False):
    """Split text into maximal runs of non-whitespace characters.

    Args:
        re: The text to tokenise. (The parameter name hides the ``re``
            module inside this function; the module is not used here.)
        printDetail: When true, print a banner and the token list.

    Returns:
        list[str]: The tokens produced by ``RegexpTokenizer(r'\\S+')``.
    """
    whitespace_tokenizer = RegexpTokenizer(r'\S+')
    found = whitespace_tokenizer.tokenize(re)

    if not printDetail:
        return found

    print('=====TOKENS=========')
    print(found)
    return found
|
||||
|
||||
def charLength(x, l=3):
    """Tell whether ``x`` is purely alphabetic and at least ``l`` characters long.

    Args:
        x: The string to check.
        l: Minimum required length (default 3).

    Returns:
        bool: True when both conditions hold, otherwise False.
    """
    # Both operands are already bool, so the expression yields True/False.
    return x.isalpha() and len(x) >= l
|
||||
|
||||
|
||||
def removeEndingPunct(re, printDetail):
    """Strip ASCII punctuation from both ends of every token.

    Despite the name, leading punctuation is removed as well as trailing
    punctuation; punctuation inside a token is left alone. Tokens made up
    solely of punctuation become empty strings and stay in the result.

    Args:
        re: Iterable of string tokens. (The parameter name hides the ``re``
            module inside this function; the module is not used here.)
        printDetail: When true, print a banner and the stripped list.

    Returns:
        list[str]: One stripped string per input token, in input order.
    """
    stripped = []
    for token in re:
        stripped.append(token.strip(punctuation))

    if printDetail:
        print('=====removeEndingPunct=========')
        print(stripped)

    return stripped
|
||||
|
||||
def handleCamelCase(re, printDetail=False, keepOriginal=False):
    """Expand every token into its camelCase components.

    Args:
        re: Iterable of string tokens. (The parameter name hides the ``re``
            module inside this function; the module is not used here.)
        printDetail: When true, print a banner and the resulting list.
        keepOriginal: When true, a token that was actually split (i.e. is
            not itself one of its parts) is appended after its parts.

    Returns:
        list[str]: The parts of every token, in input order.
    """
    pieces = []

    for word in re:
        parts = camel_case_split(word)
        pieces += parts
        if keepOriginal and word not in parts:
            pieces.append(word)

    if printDetail:
        print('=====CAMEL CASE=========')
        print(pieces)

    return pieces
|
||||
|
||||
def handleUnderScore(re, printDetail=False, keepOriginal=False):
    """Expand every token into its underscore-separated components.

    Args:
        re: Iterable of string tokens. (The parameter name hides the ``re``
            module inside this function; the module is not used here.)
        printDetail: When true, print a banner and the resulting list.
        keepOriginal: When true, a token that was actually split (i.e. is
            not itself one of its parts) is appended after its parts.

    Returns:
        list[str]: The parts of every token, in input order. Leading,
        trailing or doubled underscores produce empty-string parts.
    """
    pieces = []

    for word in re:
        parts = word.split('_')
        pieces += parts
        if keepOriginal and word not in parts:
            pieces.append(word)

    if printDetail:
        print('=====UNDER SCORE=========')
        print(pieces)

    return pieces
|
||||
|
||||
def camel_case_split(identifier):
    """Break an identifier at its camelCase boundaries.

    A boundary lies between a lower-case letter and an upper-case letter,
    or before the last capital of an upper-case run that is followed by a
    lower-case letter (so ``HTTPRequest`` gives ``HTTP`` + ``Request``).

    Args:
        identifier: The string to split.

    Returns:
        list[str]: The consecutive segments; an empty list for ``''``.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return [
        segment.group(0)
        for segment in re.finditer(boundary_pattern, identifier)
    ]
|
||||
|
||||
def stem(res, printDetail):
    """Porter-stem every non-empty token.

    Args:
        res: Iterable of string tokens; falsy entries (e.g. ``''``) are
            skipped, the rest are whitespace-stripped before stemming.
        printDetail: When true, print a banner and the stemmed list.

    Returns:
        list[str]: The stemmed tokens, in input order.
    """
    porter = PorterStemmer()

    stemmed_tokens = []
    for word in res:
        if not word:
            continue
        stemmed_tokens.append(porter.stem(word.strip()))

    if printDetail:
        print('=====STEMMED=========')
        print(stemmed_tokens)

    return stemmed_tokens
|
||||
|
||||
def isEnglish(word_to_test):
    """Return the word if WordNet has at least one synset for it.

    Args:
        word_to_test: The word to look up.

    Returns:
        The word itself when ``wordnet.synsets`` finds a match, otherwise
        ``None``.
    """
    if wordnet.synsets(word_to_test):
        return word_to_test

    # TODO: handling for words WordNet does not know is still unimplemented.
    return None
|
||||
|
||||
|
||||
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Used in this module as the ``tokenizer`` of ``TfidfVectorizer`` so the
    vectorizer performs no tokenisation of its own.
    """
    return doc
|
||||
|
||||
def calculateTfIdfCodeElementsList(aCorpus):
    """Fit a TF-IDF vectorizer on a code-element corpus.

    The vectorizer uses ``dummy_fun`` as its tokenizer, so each document is
    presumably an already-tokenised sequence -- verify against callers.
    No stop-word removal or lower-casing is applied; term frequencies are
    sublinearly scaled.

    Side effect: resets the module-level ``progress`` counter to 0.

    Args:
        aCorpus: Iterable of documents to fit on.

    Returns:
        TfidfVectorizer: The fitted vectorizer.
    """
    global progress
    progress = 0

    # max_df=0.7 / min_df=3 pruning was tried earlier and is disabled.
    vectorizer = TfidfVectorizer(
        tokenizer=dummy_fun,
        stop_words=None,
        lowercase=False,
        sublinear_tf=True,
    )
    vectorizer.fit(aCorpus)
    return vectorizer
|
||||
|
||||
def calculateTfIdfNLList(aCorpus):
    """Fit a TF-IDF vectorizer on a natural-language corpus.

    The vectorizer uses ``dummy_fun`` as its tokenizer, so each document is
    presumably an already-tokenised sequence -- verify against callers.
    No stop-word removal or lower-casing is applied; term frequencies are
    sublinearly scaled.

    Side effect: resets the module-level ``progress`` counter to 0.

    Args:
        aCorpus: Iterable of documents to fit on.

    Returns:
        TfidfVectorizer: The fitted vectorizer.
    """
    global progress
    progress = 0

    # max_df=0.7 / min_df=3 pruning was tried earlier and is disabled.
    vectorizer = TfidfVectorizer(
        tokenizer=dummy_fun,
        stop_words=None,
        lowercase=False,
        sublinear_tf=True,
    )
    vectorizer.fit(aCorpus)
    return vectorizer
|
||||
|
||||
def getDTMNL(x, v, corpus):
    """Transform one corpus entry into its document-term vector.

    Args:
        x: Object whose ``name`` attribute is the key into ``corpus``
            (presumably a pandas row passed by ``apply`` -- verify caller).
        v: Fitted vectorizer; its ``tokenizer`` attribute is overwritten
            with ``dummy_fun`` on every call.
        corpus: Indexable collection of documents.

    Returns:
        The result of ``v.transform`` on the single selected document.
    """
    row_key = x.name
    v.tokenizer = dummy_fun
    document = corpus[row_key]
    return v.transform([document])
|
||||
def getDTMCE(x, v, corpus):
    """Transform one corpus entry into its document-term vector.

    Args:
        x: Object whose ``name`` attribute is the key into ``corpus``
            (presumably a pandas row passed by ``apply`` -- verify caller).
        v: Fitted vectorizer; its ``tokenizer`` attribute is overwritten
            with ``dummy_fun`` on every call.
        corpus: Indexable collection of documents.

    Returns:
        The result of ``v.transform`` on the single selected document.
    """
    row_key = x.name
    v.tokenizer = dummy_fun
    document = corpus[row_key]
    return v.transform([document])
|
||||
|
||||
def getBRDTM(x, v, corpus):
    """Transform one corpus entry into its document-term vector.

    Args:
        x: Object whose ``name`` attribute is the key into ``corpus``
            (presumably a pandas row passed by ``apply`` -- verify caller).
        v: Fitted vectorizer; its ``tokenizer`` attribute is overwritten
            with ``dummy_fun`` on every call.
        corpus: Indexable collection of documents.

    Returns:
        The result of ``v.transform`` on the single selected document.
    """
    row_key = x.name
    v.tokenizer = dummy_fun
    document = corpus[row_key]
    return v.transform([document])
|
||||
|
||||
|
||||
def getBRDTMCEs(x, v, corpus):
    """Transform one corpus entry into its document-term vector.

    Args:
        x: Object whose ``name`` attribute is the key into ``corpus``
            (presumably a pandas row passed by ``apply`` -- verify caller).
        v: Fitted vectorizer; its ``tokenizer`` attribute is overwritten
            with ``dummy_fun`` on every call.
        corpus: Indexable collection of documents.

    Returns:
        The result of ``v.transform`` on the single selected document.
    """
    row_key = x.name
    v.tokenizer = dummy_fun
    document = corpus[row_key]
    return v.transform([document])
|
||||
Reference in New Issue
Block a user