From ebbc4a2c78a92b8214252915eead903b5991e11b Mon Sep 17 00:00:00 2001 From: Dave Statezni Date: Sun, 10 Sep 2017 22:51:53 -0500 Subject: [PATCH 1/3] Scripts for tnlinter --- libraries/linters/snippet_comparison.py | 265 ++++++++++++++++++++++++ libraries/linters/tn_linter.py | 161 +++++++++++++- 2 files changed, 418 insertions(+), 8 deletions(-) create mode 100644 libraries/linters/snippet_comparison.py diff --git a/libraries/linters/snippet_comparison.py b/libraries/linters/snippet_comparison.py new file mode 100644 index 00000000..d462afe9 --- /dev/null +++ b/libraries/linters/snippet_comparison.py @@ -0,0 +1,265 @@ +''' + snippet_comparison.py +''' + +import re +import httplib2 + +class snippet_comparison(object): + def __init__(self,book,chap,chnk): + self.book = book + self.chap = chap + self.chnk = chnk + self.httplib2_instance = httplib2.Http() + + self.book_directory = { + "GEN" : '01', + "EXO" : '02', + "LEV" : '03', + "NUM" : '04', + "DEU" : '05', + "JOS" : '06', + "JDG" : '07', + "RUT" : '08', + "1SA" : '09', + "2SA" : '10', + "1KI" : '11', + "2KI" : '12', + "1CH" : '13', + "2CH" : '14', + "EZR" : '15', + "NEH" : '16', + "EST" : '17', + "JOB" : '18', + "PSA" : '19', + "PRO" : '20', + "ECC" : '21', + "SNG" : '22', + "ISA" : '23', + "JER" : '24', + "LAM" : '25', + "EZK" : '26', + "DAN" : '27', + "HOS" : '28', + "JOL" : '29', + "AMO" : '30', + "OBA" : '31', + "JON" : '32', + "MIC" : '33', + "NAM" : '34', + "HAB" : '35', + "ZEP" : '36', + "HAG" : '37', + "ZEC" : '38', + "MAL" : '39', + "MAT" : '41', + "MRK" : '42', + "LUK" : '43', + "JHN" : '44', + "ACT" : '45', + "ROM" : '46', + "1CO" : '47', + "2CO" : '48', + "GAL" : '49', + "EPH" : '50', + "PHP" : '51', + "COL" : '52', + "1TH" : '53', + "2TH" : '54', + "1TI" : '55', + "2TI" : '56', + "TIT" : '57', + "PHM" : '58', + "HEB" : '59', + "JAS" : '60', + "1PE" : '61', + "2PE" : '62', + "1JN" : '63', + "2JN" : '64', + "3JN" : '65', + "JUD" : '66', + "REV" : '67' + } + + self.DCSwebaddressmap = { + 'en_tn' : 
'https://git.door43.org/Door43/en_tn/raw/master/', + 'en_ulb' : 'https://git.door43.org/Door43/en_ulb/raw/master/', + 'en_ugl' : 'https://git.door43.org/Door43/en_ugl/raw/master/' + } + + def getFill(bk): + if 'psa' in bk.lower(): + return 3 + return 2 + + def getulb(self): + upperbook = self.book.upper() + ulbDCS = "https://git.door43.org/Door43/en_ulb/raw/master/" + ulbsrc = ulbDCS + self.book_directory[upperbook] + '-' + upperbook + '.usfm' + resp,content = self.httplib2_instance.request(ulbsrc) + content = re.sub(r'\n','~',content) + ulbbook = content + ulbchapters = re.split(r'\\c\s+',ulbbook) + thischapter = ulbchapters[int(chapter)] + ulbchunks = re.split(r'\\s5',thischapter) + versenum = 1 + usechunk = False + savechunk = '' + for ulbchunk in ulbchunks: + lines = ulbchunk.split('~') + for line in lines: + versefound = re.search(r'\\v\s+(\d+)\s+(.+)',line) + if versefound: + versenum = int(versefound.group(1)) + if versenum >= int(chunk): + usechunk = True + if usechunk: + savechunk = savechunk + line + " " + if usechunk: + savechunk = savechunk.replace(' ',' ') + return savechunk + return '' + + def print_error (self,msg): + self.log.warning(msg+self.book+' '+self.chap+":"+self.chnk) + return + + def removepunct (instr) : + ans = instr + ans = re.sub(r'[!"\#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]','',ans) + return ans + + def tighter_search (self,sinput,uinput): + compare = True + srch = sinput + u = uinput + srch = re.sub(r'\x2d','\x20',srch) + srch = re.sub(r'{\d+}','',srch) + u = re.sub(r'\x2d','\x20', u) + srch = srch.replace('?','') + #print "srch",srch + swrds = re.split(r'\s+',srch) + srchinustr = "("+srch+")(.+)" + #print "srchinustr,u",srchinustr,'\n',u + srchinu = re.search(r''+srchinustr+'',u) + if srchinu: + remainder = srchinu.group(2) + #print "T_S remainder",remainder + ustr = srchinu.group(1) + remainder + uwrds = re.split(r'\s+',ustr) + uindex = 0 + for swrd in swrds: + if not re.search(r'\'s',swrd): + uwrds[uindex] = 
re.sub(r'\'s','',uwrds[uindex]) + stest = removepunct(swrd) + #print "swrd,stext",swrd,stest + stest = re.sub(r'ZZZZ','',stest) + utest = removepunct(uwrds[uindex]) + utest = re.sub(r'ZZZZ','',utest) + #print "undx,swrd,stest,utest",uindex,swrd,stest,utest + if stest != utest: + # check if possible follow-on compare + if re.search(r''+srch+'',remainder): + #print "call tighter_search 2nd time" + compare = tighter_search(self,sinput,remainder) + else : + #print "Last word-pair miscompared" + compare = False # no follow-on match, so miscompare + uindex = uindex + 1 + return compare + + def compare_snippet (self,tn, ulb): + global book,chap,chnk,author,comdate + compare = True + eitherelippsis = 0 + snippet = tn + savesnippet = snippet + snippet = re.sub(r'\(','XXXX',snippet) + snippet = re.sub(r'\)','ZZZZ',snippet) + snippet = snippet.replace("?",' QM') + snippet = re.sub(r'\x97',' EMB ',snippet) # em-dash + snippet = re.sub(r'\xe2\x80\x94',' EMB ',snippet) # em-dash + ulb = ulb.replace("?",' QM') + ulb = re.sub(r'\(','XXXX',ulb) + ulb = re.sub(r'\)','ZZZZ',ulb) + ulb = re.sub(r'~',' ',ulb) + ulb = re.sub(r'\\v\s+\d+\s+',' ',ulb) + ulb = re.sub(r'\\q\d+','',ulb) + ulb = re.sub(r'\\q\s+',' ',ulb) + ulb = re.sub(r'\\m','',ulb) + ulb = re.sub(r'\s*\x97\s*',' EMB ',ulb) # em-dash + ulb = re.sub(r'\s*\xe2\x80\x94\s*',' EMB ',ulb) # em-dash + ulb = re.sub(r'\s{2,}',' ',ulb) + if re.search(r'\.\.\.',ulb): + srchulb = ulb + eitherelippsis = 1 + srchulb = re.sub(r'\s+\.\.\.\s+',' ',srchulb) + srchulb = re.sub(r'\s+\.\.\.',' ',srchulb) + srchulb = re.sub(r'\.\.\.\s+',' ',srchulb) + srchulb = re.sub(r'\.\.\.',' ',ulb) + else : + srchulb = ulb + if re.search(r'\.\.\.',snippet): + eitherelippsis = 1 + srchstr = snippet + srchstr = re.sub(r'\s+\.\.\.\s+','^',srchstr) + srchstr = re.sub(r'\s+\.\.\.','^',srchstr) + srchstr = re.sub(r'\.\.\.\s+','^',srchstr) + srchstr = re.sub(r'\.\.\.','^',srchstr) + srchstr = srchstr.replace('^','.+') + else: + srchstr = snippet + #print 
"SRCHSTR\n",srchstr ,"\nSNIPPET\n",snippet,"\nSRCHULB\n",srchulb + if eitherelippsis == 1: + strinulb = re.search(r''+srchstr+'',srchulb) + if not strinulb : + compare = False + # print "Miscomp with elippsis" + # print "tn,ulb===> ",srchstr,"\n",srchulb + print_error(self,"Snippet miscompare for") + else: + #snippet = snippet + "\x3f" + #print "snippet,ULB",snippet,'\n',srchulb + strinulb = re.search(r''+snippet+'',srchulb) + if strinulb : + #print "call tighter_search" + #print "IN ULB snippet,ULB",snippet,'\n',srchulb + compare = tighter_search (snippet, ulb) + else: + #print "DON'T call tighter_search" + #print 'Not IN ULB snippet,ULB:"'+snippet+'"\n',srchulb + compare = False + if not compare: + # print "Miscomp without elippsis" + # print '\n\n',book,chap,chnk,'\nsavesnippet',savesnippet,"\nsrchULB",srchulb + print_error(self,"Snippet miscompare for") + return compare + + def parse_tn_file(self): + any_error_found = False + bookname = self.book.lower() + zerofillwidth = getFill(book) + chapname = self.chap.zfill(zerofillwidth) + chunkname = self.chnk.zfill(zerofillwidth) + tnDCS = "https://git.door43.org/Door43/en_tn/raw/master/" + tnsrc = tnDCS + bookname + '/' + chapname + '/' + chunkname + '.md' + resp,tncontent = self.httplib2_instance.request(tnsrc) + ulb_chunkdata = getulb(self) + linenumber = 0 + compare = False + tnlines = tncontent.split('\n') + snippet = '' + for iline in tnlines: + linenumber = linenumber + 1 + markerfound = re.search(r'^\#{1}\s+(.+)',iline) + if markerfound: + remainder = markerfound.group(1) + if (not re.search(r'translationWords',remainder)) and (not re.search(r'General Information',remainder)) and (not re.search(r'Connecting Statement',remainder)): + snippet = remainder + compare = compare_snippet(self,snippet, ulb_chunkdata) + if (snippet == ''): + compare = True # Since there were none to compare + return compare + + thiscompare = parse_tn_file(self) + return thiscompare \ No newline at end of file diff --git 
a/libraries/linters/tn_linter.py b/libraries/linters/tn_linter.py index 2ac5d3d0..de5dd24c 100644 --- a/libraries/linters/tn_linter.py +++ b/libraries/linters/tn_linter.py @@ -1,15 +1,160 @@ from __future__ import print_function, unicode_literals from libraries.linters.markdown_linter import MarkdownLinter +from libraries.linters.snippet_comparison import snippet_comparison +''' + tn_linter.py +''' + +import re +import httplib2 + + class TnLinter(MarkdownLinter): - def lint(self): - """ - Checks for issues with translationNotes + def lint(self): + """ + Checks for issues with translationNotes + Use self.log.warning("message") to log any issues. + self.source_dir is the directory of source files (.md) + :return boolean: + """ + self.httplib2_instance = httplib2.Http() + self.book_directory = { + "GEN" : '01', + "EXO" : '02', + "LEV" : '03', + "NUM" : '04', + "DEU" : '05', + "JOS" : '06', + "JDG" : '07', + "RUT" : '08', + "1SA" : '09', + "2SA" : '10', + "1KI" : '11', + "2KI" : '12', + "1CH" : '13', + "2CH" : '14', + "EZR" : '15', + "NEH" : '16', + "EST" : '17', + "JOB" : '18', + "PSA" : '19', + "PRO" : '20', + "ECC" : '21', + "SNG" : '22', + "ISA" : '23', + "JER" : '24', + "LAM" : '25', + "EZK" : '26', + "DAN" : '27', + "HOS" : '28', + "JOL" : '29', + "AMO" : '30', + "OBA" : '31', + "JON" : '32', + "MIC" : '33', + "NAM" : '34', + "HAB" : '35', + "ZEP" : '36', + "HAG" : '37', + "ZEC" : '38', + "MAL" : '39', + "MAT" : '41', + "MRK" : '42', + "LUK" : '43', + "JHN" : '44', + "ACT" : '45', + "ROM" : '46', + "1CO" : '47', + "2CO" : '48', + "GAL" : '49', + "EPH" : '50', + "PHP" : '51', + "COL" : '52', + "1TH" : '53', + "2TH" : '54', + "1TI" : '55', + "2TI" : '56', + "TIT" : '57', + "PHM" : '58', + "HEB" : '59', + "JAS" : '60', + "1PE" : '61', + "2PE" : '62', + "1JN" : '63', + "2JN" : '64', + "3JN" : '65', + "JUD" : '66', + "REV" : '67' + } + + self.DCSwebaddressmap = { + 'en_tn' : 'https://git.door43.org/Door43/en_tn/raw/master/', + 'en_ulb' : 
'https://git.door43.org/Door43/en_ulb/raw/master/', + 'en_ugl' : 'https://git.door43.org/Door43/en_ugl/raw/master/' + } + + return super(TnLinter, self).lint() # Runs checks on Markdown, using the markdown linter + + def getFill(bk): + if 'psa' in bk.lower(): + return 3 + return 2 + + def getulb(self): + upperbook = self.book.upper() + ulbDCS = "https://git.door43.org/Door43/en_ulb/raw/master/" + ulbsrc = ulbDCS + self.book_directory[upperbook] + '-' + upperbook + '.usfm' + resp,content = self.httplib2_instance.request(ulbsrc) + content = re.sub(r'\n','~',content) + ulbbook = content + ulbchapters = re.split(r'\\c\s+',ulbbook) + thischapter = ulbchapters[int(chapter)] + ulbchunks = re.split(r'\\s5',thischapter) + versenum = 1 + usechunk = False + savechunk = '' + for ulbchunk in ulbchunks: + lines = ulbchunk.split('~') + for line in lines: + versefound = re.search(r'\\v\s+(\d+)\s+(.+)',line) + if versefound: + versenum = int(versefound.group(1)) + if versenum >= int(chunk): + usechunk = True + if usechunk: + savechunk = savechunk + line + " " + if usechunk: + savechunk = savechunk.replace(' ',' ') + return savechunk + return '' + + def removepunct (instr) : + ans = instr + ans = re.sub(r'[!"\#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]','',ans) + return ans + + def linter(self): + +# REMOVE comment below for DCS integration, and delete following line +# compare_url = self.compare_url + compare_url = "https://git.door43.org/Door43/en_tn/compare/b0459647bf6e0998b61d3095f183a7bc636678b8...52739c834a38525a86e5da7990eea7265cb76052" + + all_compared = True + + findmodule = re.compile(r'(\w{3}\/\d{2,3}\/\d{2,3}\.md)') + resp,tncontent = self.httplib2_instance.request(compare_url) + elements = [] + for i, m in enumerate(findmodule.finditer(tncontent)): + elements.append(m.group(2)) + for onefile in elements: + onefile = onefile.replace(".md","") + fnpieces = onefile.split("/") + book = fnpieces[0] + chap = fnpieces[1] + chnk = fnpieces[2] + this_compare = 
snippet_comparison(book,chap,chnk) + all_compared = all_compared and this_compare - Use self.log.warning("message") to log any issues. - self.source_dir is the directory of source files (.md) - :return boolean: - """ - return super(TnLinter, self).lint() # Runs checks on Markdown, using the markdown linter From 24e12c67c05d9a9de6d6c0301edcba2b405f36c5 Mon Sep 17 00:00:00 2001 From: Dave Statezni Date: Sat, 16 Sep 2017 12:42:54 -0500 Subject: [PATCH 2/3] Correct python syntax issues --- libraries/linters/snippet_comparison.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libraries/linters/snippet_comparison.py b/libraries/linters/snippet_comparison.py index d462afe9..5a74d841 100644 --- a/libraries/linters/snippet_comparison.py +++ b/libraries/linters/snippet_comparison.py @@ -86,6 +86,7 @@ def __init__(self,book,chap,chnk): 'en_ulb' : 'https://git.door43.org/Door43/en_ulb/raw/master/', 'en_ugl' : 'https://git.door43.org/Door43/en_ugl/raw/master/' } + thiscompare = parse_tn_file(self) def getFill(bk): if 'psa' in bk.lower(): @@ -261,5 +262,3 @@ def parse_tn_file(self): compare = True # Since there were none to compare return compare - thiscompare = parse_tn_file(self) - return thiscompare \ No newline at end of file From 7b663ba3fb8c38dcfb0b0aed4ec98dda7f378ab0 Mon Sep 17 00:00:00 2001 From: Dave Statezni Date: Tue, 26 Sep 2017 10:50:26 -0500 Subject: [PATCH 3/3] Update tn_linter and snippet_comparison towards DCS standards --- libraries/linters/snippet_comparison.py | 430 ++++++++++-------------- libraries/linters/tn_linter.py | 212 ++++-------- 2 files changed, 254 insertions(+), 388 deletions(-) diff --git a/libraries/linters/snippet_comparison.py b/libraries/linters/snippet_comparison.py index 5a74d841..e0e6cfd5 100644 --- a/libraries/linters/snippet_comparison.py +++ b/libraries/linters/snippet_comparison.py @@ -3,262 +3,194 @@ ''' import re -import httplib2 +from libraries.door43_tools import bible_books +from libraries.general_tools
import url_utils class snippet_comparison(object): - def __init__(self,book,chap,chnk): - self.book = book - self.chap = chap - self.chnk = chnk - self.httplib2_instance = httplib2.Http() - - self.book_directory = { - "GEN" : '01', - "EXO" : '02', - "LEV" : '03', - "NUM" : '04', - "DEU" : '05', - "JOS" : '06', - "JDG" : '07', - "RUT" : '08', - "1SA" : '09', - "2SA" : '10', - "1KI" : '11', - "2KI" : '12', - "1CH" : '13', - "2CH" : '14', - "EZR" : '15', - "NEH" : '16', - "EST" : '17', - "JOB" : '18', - "PSA" : '19', - "PRO" : '20', - "ECC" : '21', - "SNG" : '22', - "ISA" : '23', - "JER" : '24', - "LAM" : '25', - "EZK" : '26', - "DAN" : '27', - "HOS" : '28', - "JOL" : '29', - "AMO" : '30', - "OBA" : '31', - "JON" : '32', - "MIC" : '33', - "NAM" : '34', - "HAB" : '35', - "ZEP" : '36', - "HAG" : '37', - "ZEC" : '38', - "MAL" : '39', - "MAT" : '41', - "MRK" : '42', - "LUK" : '43', - "JHN" : '44', - "ACT" : '45', - "ROM" : '46', - "1CO" : '47', - "2CO" : '48', - "GAL" : '49', - "EPH" : '50', - "PHP" : '51', - "COL" : '52', - "1TH" : '53', - "2TH" : '54', - "1TI" : '55', - "2TI" : '56', - "TIT" : '57', - "PHM" : '58', - "HEB" : '59', - "JAS" : '60', - "1PE" : '61', - "2PE" : '62', - "1JN" : '63', - "2JN" : '64', - "3JN" : '65', - "JUD" : '66', - "REV" : '67' - } + def __init__(self,book,chap,chnk): + self.book = book + self.chap = chap + self.chnk = chnk - self.DCSwebaddressmap = { - 'en_tn' : 'https://git.door43.org/Door43/en_tn/raw/master/', - 'en_ulb' : 'https://git.door43.org/Door43/en_ulb/raw/master/', - 'en_ugl' : 'https://git.door43.org/Door43/en_ugl/raw/master/' - } - thiscompare = parse_tn_file(self) - - def getFill(bk): - if 'psa' in bk.lower(): - return 3 - return 2 + self.DCSwebaddressmap = { + 'en_tn' : 'https://git.door43.org/Door43/en_tn/raw/master/', + 'en_ulb' : 'https://git.door43.org/Door43/en_ulb/raw/master/', + 'en_ugl' : 'https://git.door43.org/Door43/en_ugl/raw/master/' + } + thiscompare = parse_tn_file(self) + + def getFill(bk): + if 'psa' in 
bk.lower(): + return 3 + return 2 - def getulb(self): - upperbook = self.book.upper() - ulbDCS = "https://git.door43.org/Door43/en_ulb/raw/master/" - ulbsrc = ulbDCS + self.book_directory[upperbook] + '-' + upperbook + '.usfm' - resp,content = self.httplib2_instance.request(ulbsrc) - content = re.sub(r'\n','~',content) - ulbbook = content - ulbchapters = re.split(r'\\c\s+',ulbbook) - thischapter = ulbchapters[int(chapter)] - ulbchunks = re.split(r'\\s5',thischapter) - versenum = 1 - usechunk = False - savechunk = '' - for ulbchunk in ulbchunks: - lines = ulbchunk.split('~') - for line in lines: - versefound = re.search(r'\\v\s+(\d+)\s+(.+)',line) - if versefound: - versenum = int(versefound.group(1)) - if versenum >= int(chunk): - usechunk = True - if usechunk: - savechunk = savechunk + line + " " - if usechunk: - savechunk = savechunk.replace(' ',' ') - return savechunk - return '' + def getulb(self): + lowerbook = self.book.lower() + upperbook = self.book.upper() + ulbDCS = "https://git.door43.org/Door43/en_ulb/raw/master/" + ulbsrc = ulbDCS + bible_books.BOOK_NUMBERS[lowerbook] + '-' + upperbook + '.usfm' + content = url_utils.get_url(ulbsrc) # resp,content = self.httplib2_instance.request(ulbsrc) + content = re.sub(r'\n','~',content) + ulbbook = content + ulbchapters = re.split(r'\\c\s+',ulbbook) + thischapter = ulbchapters[int(chapter)] + ulbchunks = re.split(r'\\s5',thischapter) + versenum = 1 + usechunk = False + savechunk = '' + for ulbchunk in ulbchunks: + lines = ulbchunk.split('~') + for line in lines: + versefound = re.search(r'\\v\s+(\d+)\s+(.+)',line) + if versefound: + versenum = int(versefound.group(1)) + if versenum >= int(chunk): + usechunk = True + if usechunk: + savechunk = savechunk + line + " " + if usechunk: + savechunk = savechunk.replace(' ',' ') + return savechunk + return '' - def print_error (self,msg): - self.log.warning(msg+self.book+' '+self.chap+":"+self.chnk) - return + def print_error (self,msg): + self.log.warning(msg+self.book+' 
'+self.chap+":"+self.chnk) + return - def removepunct (instr) : - ans = instr - ans = re.sub(r'[!"\#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]','',ans) - return ans + def removepunct (instr) : + ans = instr + ans = re.sub(r'[!"\#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]','',ans) + return ans - def tighter_search (self,sinput,uinput): - compare = True - srch = sinput - u = uinput - srch = re.sub(r'\x2d','\x20',srch) - srch = re.sub(r'{\d+}','',srch) - u = re.sub(r'\x2d','\x20', u) - srch = srch.replace('?','') - #print "srch",srch - swrds = re.split(r'\s+',srch) - srchinustr = "("+srch+")(.+)" - #print "srchinustr,u",srchinustr,'\n',u - srchinu = re.search(r''+srchinustr+'',u) - if srchinu: - remainder = srchinu.group(2) - #print "T_S remainder",remainder - ustr = srchinu.group(1) + remainder - uwrds = re.split(r'\s+',ustr) - uindex = 0 - for swrd in swrds: - if not re.search(r'\'s',swrd): - uwrds[uindex] = re.sub(r'\'s','',uwrds[uindex]) - stest = removepunct(swrd) - #print "swrd,stext",swrd,stest - stest = re.sub(r'ZZZZ','',stest) - utest = removepunct(uwrds[uindex]) - utest = re.sub(r'ZZZZ','',utest) - #print "undx,swrd,stest,utest",uindex,swrd,stest,utest - if stest != utest: - # check if possible follow-on compare - if re.search(r''+srch+'',remainder): - #print "call tighter_search 2nd time" - compare = tighter_search(self,sinput,remainder) - else : - #print "Last word-pair miscompared" - compare = False # no follow-on match, so miscompare - uindex = uindex + 1 - return compare + def tighter_search (self,sinput,uinput): + compare = True + srch = sinput + u = uinput + srch = re.sub(r'\x2d','\x20',srch) + srch = re.sub(r'{\d+}','',srch) + u = re.sub(r'\x2d','\x20', u) + srch = srch.replace('?','') + #print "srch",srch + swrds = re.split(r'\s+',srch) + srchinustr = "("+srch+")(.+)" + #print "srchinustr,u",srchinustr,'\n',u + srchinu = re.search(r''+srchinustr+'',u) + if srchinu: + remainder = srchinu.group(2) + #print "T_S remainder",remainder + ustr = srchinu.group(1) + 
remainder + uwrds = re.split(r'\s+',ustr) + uindex = 0 + for swrd in swrds: + if not re.search(r'\'s',swrd): + uwrds[uindex] = re.sub(r'\'s','',uwrds[uindex]) + stest = removepunct(swrd) + #print "swrd,stext",swrd,stest + stest = re.sub(r'ZZZZ','',stest) + utest = removepunct(uwrds[uindex]) + utest = re.sub(r'ZZZZ','',utest) + #print "undx,swrd,stest,utest",uindex,swrd,stest,utest + if stest != utest: + # check if possible follow-on compare + if re.search(r''+srch+'',remainder): + #print "call tighter_search 2nd time" + compare = tighter_search(self,sinput,remainder) + else : + #print "Last word-pair miscompared" + compare = False # no follow-on match, so miscompare + uindex = uindex + 1 + return compare - def compare_snippet (self,tn, ulb): - global book,chap,chnk,author,comdate - compare = True - eitherelippsis = 0 - snippet = tn - savesnippet = snippet - snippet = re.sub(r'\(','XXXX',snippet) - snippet = re.sub(r'\)','ZZZZ',snippet) - snippet = snippet.replace("?",' QM') - snippet = re.sub(r'\x97',' EMB ',snippet) # em-dash - snippet = re.sub(r'\xe2\x80\x94',' EMB ',snippet) # em-dash - ulb = ulb.replace("?",' QM') - ulb = re.sub(r'\(','XXXX',ulb) - ulb = re.sub(r'\)','ZZZZ',ulb) - ulb = re.sub(r'~',' ',ulb) - ulb = re.sub(r'\\v\s+\d+\s+',' ',ulb) - ulb = re.sub(r'\\q\d+','',ulb) - ulb = re.sub(r'\\q\s+',' ',ulb) - ulb = re.sub(r'\\m','',ulb) - ulb = re.sub(r'\s*\x97\s*',' EMB ',ulb) # em-dash - ulb = re.sub(r'\s*\xe2\x80\x94\s*',' EMB ',ulb) # em-dash - ulb = re.sub(r'\s{2,}',' ',ulb) - if re.search(r'\.\.\.',ulb): - srchulb = ulb - eitherelippsis = 1 - srchulb = re.sub(r'\s+\.\.\.\s+',' ',srchulb) - srchulb = re.sub(r'\s+\.\.\.',' ',srchulb) - srchulb = re.sub(r'\.\.\.\s+',' ',srchulb) - srchulb = re.sub(r'\.\.\.',' ',ulb) - else : - srchulb = ulb - if re.search(r'\.\.\.',snippet): - eitherelippsis = 1 - srchstr = snippet - srchstr = re.sub(r'\s+\.\.\.\s+','^',srchstr) - srchstr = re.sub(r'\s+\.\.\.','^',srchstr) - srchstr = re.sub(r'\.\.\.\s+','^',srchstr) - 
srchstr = re.sub(r'\.\.\.','^',srchstr) - srchstr = srchstr.replace('^','.+') - else: - srchstr = snippet - #print "SRCHSTR\n",srchstr ,"\nSNIPPET\n",snippet,"\nSRCHULB\n",srchulb - if eitherelippsis == 1: - strinulb = re.search(r''+srchstr+'',srchulb) - if not strinulb : - compare = False - # print "Miscomp with elippsis" - # print "tn,ulb===> ",srchstr,"\n",srchulb - print_error(self,"Snippet miscompare for") - else: - #snippet = snippet + "\x3f" - #print "snippet,ULB",snippet,'\n',srchulb - strinulb = re.search(r''+snippet+'',srchulb) - if strinulb : - #print "call tighter_search" - #print "IN ULB snippet,ULB",snippet,'\n',srchulb - compare = tighter_search (snippet, ulb) - else: - #print "DON'T call tighter_search" - #print 'Not IN ULB snippet,ULB:"'+snippet+'"\n',srchulb - compare = False - if not compare: - # print "Miscomp without elippsis" - # print '\n\n',book,chap,chnk,'\nsavesnippet',savesnippet,"\nsrchULB",srchulb - print_error(self,"Snippet miscompare for") - return compare + def compare_snippet (self,tn, ulb): + global book,chap,chnk,author,comdate + compare = True + eitherelippsis = 0 + snippet = tn + savesnippet = snippet + snippet = re.sub(r'\(','XXXX',snippet) + snippet = re.sub(r'\)','ZZZZ',snippet) + snippet = snippet.replace("?",' QM') + snippet = re.sub(r'\x97',' EMB ',snippet) # em-dash + snippet = re.sub(r'\xe2\x80\x94',' EMB ',snippet) # em-dash + ulb = ulb.replace("?",' QM') + ulb = re.sub(r'\(','XXXX',ulb) + ulb = re.sub(r'\)','ZZZZ',ulb) + ulb = re.sub(r'~',' ',ulb) + ulb = re.sub(r'\\v\s+\d+\s+',' ',ulb) + ulb = re.sub(r'\\q\d+','',ulb) + ulb = re.sub(r'\\q\s+',' ',ulb) + ulb = re.sub(r'\\m','',ulb) + ulb = re.sub(r'\s*\x97\s*',' EMB ',ulb) # em-dash + ulb = re.sub(r'\s*\xe2\x80\x94\s*',' EMB ',ulb) # em-dash + ulb = re.sub(r'\s{2,}',' ',ulb) + if re.search(r'\.\.\.',ulb): + srchulb = ulb + eitherelippsis = 1 + srchulb = re.sub(r'\s+\.\.\.\s+',' ',srchulb) + srchulb = re.sub(r'\s+\.\.\.',' ',srchulb) + srchulb = re.sub(r'\.\.\.\s+',' 
',srchulb) + srchulb = re.sub(r'\.\.\.',' ',ulb) + else : + srchulb = ulb + if re.search(r'\.\.\.',snippet): + eitherelippsis = 1 + srchstr = snippet + srchstr = re.sub(r'\s+\.\.\.\s+','^',srchstr) + srchstr = re.sub(r'\s+\.\.\.','^',srchstr) + srchstr = re.sub(r'\.\.\.\s+','^',srchstr) + srchstr = re.sub(r'\.\.\.','^',srchstr) + srchstr = srchstr.replace('^','.+') + else: + srchstr = snippet + #print "SRCHSTR\n",srchstr ,"\nSNIPPET\n",snippet,"\nSRCHULB\n",srchulb + if eitherelippsis == 1: + strinulb = re.search(r''+srchstr+'',srchulb) + if not strinulb : + compare = False + # print "Miscomp with elippsis" + # print "tn,ulb===> ",srchstr,"\n",srchulb + print_error(self,"Snippet miscompare for") + else: + #snippet = snippet + "\x3f" + #print "snippet,ULB",snippet,'\n',srchulb + strinulb = re.search(r''+snippet+'',srchulb) + if strinulb : + #print "call tighter_search" + #print "IN ULB snippet,ULB",snippet,'\n',srchulb + compare = tighter_search (snippet, ulb) + else: + #print "DON'T call tighter_search" + #print 'Not IN ULB snippet,ULB:"'+snippet+'"\n',srchulb + compare = False + if not compare: + # print "Miscomp without elippsis" + # print '\n\n',book,chap,chnk,'\nsavesnippet',savesnippet,"\nsrchULB",srchulb + print_error(self,"Snippet miscompare for") + return compare - def parse_tn_file(self): - any_error_found = False - bookname = self.book.lower() - zerofillwidth = getFill(book) - chapname = self.chap.zfill(zerofillwidth) - chunkname = self.chnk.zfill(zerofillwidth) - tnDCS = "https://git.door43.org/Door43/en_tn/raw/master/" - tnsrc = tnDCS + bookname + '/' + chapname + '/' + chunkname + '.md' - resp,tncontent = self.httplib2_instance.request(tnsrc) - ulb_chunkdata = getulb(self) - linenumber = 0 - compare = False - tnlines = tncontent.split('\n') - snippet = '' - for iline in tnlines: - linenumber = linenumber + 1 - markerfound = re.search(r'^\#{1}\s+(.+)',iline) - if markerfound: - remainder = markerfound.group(1) - if (not 
re.search(r'translationWords',remainder)) and (not re.search(r'General Information',remainder)) and (not re.search(r'Connecting Statement',remainder)): - snippet = remainder - compare = compare_snippet(self,snippet, ulb_chunkdata) - if (snippet == ''): - compare = True # Since there were none to compare - return compare + def parse_tn_file(self): + any_error_found = False + bookname = self.book.lower() + zerofillwidth = getFill(book) + chapname = self.chap.zfill(zerofillwidth) + chunkname = self.chnk.zfill(zerofillwidth) + tnDCS = "https://git.door43.org/Door43/en_tn/raw/master/" + tnsrc = tnDCS + bookname + '/' + chapname + '/' + chunkname + '.md' + tncontent = url_utils.get_url(tnsrc) # resp,tncontent = self.httplib2_instance.request(tnsrc) + ulb_chunkdata = getulb(self) + linenumber = 0 + compare = False + tnlines = tncontent.split('\n') + snippet = '' + for iline in tnlines: + linenumber = linenumber + 1 + markerfound = re.search(r'^\#{1}\s+(.+)',iline) + if markerfound: + remainder = markerfound.group(1) + if (not re.search(r'translationWords',remainder)) and (not re.search(r'General Information',remainder)) and (not re.search(r'Connecting Statement',remainder)): + snippet = remainder + compare = compare_snippet(self,snippet, ulb_chunkdata) + if (snippet == ''): + compare = True # Since there were none to compare + return compare diff --git a/libraries/linters/tn_linter.py b/libraries/linters/tn_linter.py index de5dd24c..47cc1da0 100644 --- a/libraries/linters/tn_linter.py +++ b/libraries/linters/tn_linter.py @@ -2,159 +2,93 @@ from libraries.linters.markdown_linter import MarkdownLinter from libraries.linters.snippet_comparison import snippet_comparison +from libraries.door43_tools import bible_books +from libraries.general_tools import url_utils + +import re + ''' tn_linter.py ''' - -import re -import httplib2 +#import httplib2 class TnLinter(MarkdownLinter): - def lint(self): - """ - Checks for issues with translationNotes - Use self.log.warning("message") 
to log any issues. - self.source_dir is the directory of source files (.md) - :return boolean: - """ - self.httplib2_instance = httplib2.Http() - self.book_directory = { - "GEN" : '01', - "EXO" : '02', - "LEV" : '03', - "NUM" : '04', - "DEU" : '05', - "JOS" : '06', - "JDG" : '07', - "RUT" : '08', - "1SA" : '09', - "2SA" : '10', - "1KI" : '11', - "2KI" : '12', - "1CH" : '13', - "2CH" : '14', - "EZR" : '15', - "NEH" : '16', - "EST" : '17', - "JOB" : '18', - "PSA" : '19', - "PRO" : '20', - "ECC" : '21', - "SNG" : '22', - "ISA" : '23', - "JER" : '24', - "LAM" : '25', - "EZK" : '26', - "DAN" : '27', - "HOS" : '28', - "JOL" : '29', - "AMO" : '30', - "OBA" : '31', - "JON" : '32', - "MIC" : '33', - "NAM" : '34', - "HAB" : '35', - "ZEP" : '36', - "HAG" : '37', - "ZEC" : '38', - "MAL" : '39', - "MAT" : '41', - "MRK" : '42', - "LUK" : '43', - "JHN" : '44', - "ACT" : '45', - "ROM" : '46', - "1CO" : '47', - "2CO" : '48', - "GAL" : '49', - "EPH" : '50', - "PHP" : '51', - "COL" : '52', - "1TH" : '53', - "2TH" : '54', - "1TI" : '55', - "2TI" : '56', - "TIT" : '57', - "PHM" : '58', - "HEB" : '59', - "JAS" : '60', - "1PE" : '61', - "2PE" : '62', - "1JN" : '63', - "2JN" : '64', - "3JN" : '65', - "JUD" : '66', - "REV" : '67' - } - - self.DCSwebaddressmap = { - 'en_tn' : 'https://git.door43.org/Door43/en_tn/raw/master/', - 'en_ulb' : 'https://git.door43.org/Door43/en_ulb/raw/master/', - 'en_ugl' : 'https://git.door43.org/Door43/en_ugl/raw/master/' - } + def lint(self): + """ + Checks for issues with translationNotes + Use self.log.warning("message") to log any issues. 
+ self.source_dir is the directory of source files (.md) + :return boolean: + """ + self.DCSwebaddressmap = { + 'en_tn' : 'https://git.door43.org/Door43/en_tn/raw/master/', + 'en_ulb' : 'https://git.door43.org/Door43/en_ulb/raw/master/', + 'en_ugl' : 'https://git.door43.org/Door43/en_ugl/raw/master/' + } - return super(TnLinter, self).lint() # Runs checks on Markdown, using the markdown linter + return super(TnLinter, self).lint() # Runs checks on Markdown, using the markdown linter - def getFill(bk): - if 'psa' in bk.lower(): - return 3 - return 2 + def getFill(bk): + if 'psa' in bk.lower(): + return 3 + return 2 - def getulb(self): - upperbook = self.book.upper() - ulbDCS = "https://git.door43.org/Door43/en_ulb/raw/master/" - ulbsrc = ulbDCS + self.book_directory[upperbook] + '-' + upperbook + '.usfm' - resp,content = self.httplib2_instance.request(ulbsrc) - content = re.sub(r'\n','~',content) - ulbbook = content - ulbchapters = re.split(r'\\c\s+',ulbbook) - thischapter = ulbchapters[int(chapter)] - ulbchunks = re.split(r'\\s5',thischapter) - versenum = 1 - usechunk = False - savechunk = '' - for ulbchunk in ulbchunks: - lines = ulbchunk.split('~') - for line in lines: - versefound = re.search(r'\\v\s+(\d+)\s+(.+)',line) - if versefound: - versenum = int(versefound.group(1)) - if versenum >= int(chunk): - usechunk = True - if usechunk: - savechunk = savechunk + line + " " - if usechunk: - savechunk = savechunk.replace(' ',' ') - return savechunk - return '' + def getulb(self): + lowerbook = self.book.lower() + upperbook = self.book.upper() + ulbDCS = "https://git.door43.org/Door43/en_ulb/raw/master/" + ulbsrc = ulbDCS + bible_books.BOOK_NUMBERS[lowerbook] + '-' + upperbook + '.usfm' + content = url_utils.get_url(ulbsrc) # self.httplib2_instance.request(ulbsrc) + content = re.sub(r'\n','~',content) + ulbbook = content + ulbchapters = re.split(r'\\c\s+',ulbbook) + thischapter = ulbchapters[int(chapter)] + ulbchunks = re.split(r'\\s5',thischapter) + versenum = 1 + 
usechunk = False + savechunk = '' + for ulbchunk in ulbchunks: + lines = ulbchunk.split('~') + for line in lines: + versefound = re.search(r'\\v\s+(\d+)\s+(.+)',line) + if versefound: + versenum = int(versefound.group(1)) + if versenum >= int(chunk): + usechunk = True + if usechunk: + savechunk = savechunk + line + " " + if usechunk: + savechunk = savechunk.replace(' ',' ') + return savechunk + return '' - def removepunct (instr) : - ans = instr - ans = re.sub(r'[!"\#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]','',ans) - return ans + def removepunct (instr) : + ans = instr + ans = re.sub(r'[!"\#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]','',ans) + return ans - def linter(self): + def linter(self): # REMOVE comment below for DCS integration, and delete following line -# compare_url = self.compare_url - compare_url = "https://git.door43.org/Door43/en_tn/compare/b0459647bf6e0998b61d3095f183a7bc636678b8...52739c834a38525a86e5da7990eea7265cb76052" +# compare_url = self.compare_url + compare_url = "https://git.door43.org/Door43/en_tn/compare/b0459647bf6e0998b61d3095f183a7bc636678b8...52739c834a38525a86e5da7990eea7265cb76052" - all_compared = True - - findmodule = re.compile(r'(\w{3}\/\d{2,3}\/\d{2,3}\.md)') - resp,tncontent = self.httplib2_instance.request(compare_url) - elements = [] - for i, m in enumerate(findmodule.finditer(tncontent)): - elements.append(m.group(2)) - for onefile in elements: - onefile = onefile.replace(".md","") - fnpieces = onefile.split("/") - book = fnpieces[0] - chap = fnpieces[1] - chnk = fnpieces[2] - this_compare = snippet_comparison(book,chap,chnk) - all_compared = all_compared and this_compare + all_compared = True + + findmodule = re.compile(r'(\w{3}\/\d{2,3}\/\d{2,3}\.md)') + tncontent = url_utils.get_url(compare_url) # self.httplib2_instance.request(compare_url) + elements = [] + for i, m in enumerate(findmodule.finditer(tncontent)): + elements.append(m.group(2)) + for onefile in elements: + onefile = onefile.replace(".md","") + fnpieces = 
onefile.split("/") + book = fnpieces[0] + chap = fnpieces[1] + chnk = fnpieces[2] + this_compare = snippet_comparison(book,chap,chnk) + all_compared = all_compared and this_compare