# -*- coding: utf-8 -*-


"""  --- Corpus Clean - Stage 1
Copyright (c) 2008-2010 Gisle Ytrestol (gisley@ifi.uio.no)
 
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at
your option) any later version.
 
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.
 

The first stage of this script takes a folder of Wikipedia articles in the 
original Wikipedia markup, and strips the article for unwanted markup. 
The script is intended to be used together with Tokenizer v1.0, 
http://www.cis.uni-muenchen.de/~wastl/misc/.

The output of this script is a single file which should be used as input
for Tokenizer v1.0 . The output from Tokenizer should be used by stage 2,
cccp.py. This second stage inserts proper sentence boundaries and allows
the user to specify whether the output should be one single file for each
corresponding input file, or whether the entire corpus should be dumped 
into on single file/corpus.

Run python ccp.py -h for help!

"""

import os, re, regex,  urllib,cccp,langregex

import sys,string

import codecs


from optparse import OptionParser


class AdjustName:
     
    def addSlash(self,name):
        if name[-1] != '/':
            name = name+'/'
            return name
        else:
            return name
    
    def removeSlash(self,name):
        if name[-1] == '/':
            name = name[:-1]
            return name
        else:
            return name


class WikiReader:
        
    def readFile(self,file):
            wholeFile = file.read()
            return wholeFile
        
    def listFiles(self,inputFolder):
            fileList = os.listdir(inputFolder)
            #print fileList
            newFileList = []
            for file in fileList:
                if not file[0]== '.':
                    newFileList.append(file)
            newFileList.sort()
            #print newFileList
            return newFileList
        
    def readFirstLine(self,file):
            firstLine = file.readline()
            return firstLine


class WikiProcessor:
    def __init__(self,redirect):
        self.dictChecker = WikiDict()
        self.redirect = redirect
        if self.redirect == None:
            self.redirect = 1
        
 
    def tableCleanerOld(self,input):
        input = regex.regtableConvertStart.sub(r'Ӂ',input)
        input = regex.regtableConvertEnd.sub(r'\1ጣ',input)
        while True:
        	if regex.regtableConverted.search(input):
        		input = regex.regtableConverted.sub(r'<EOS />',input)
        	else:
        		break
        input = regex.regtableConvertRevertStart.sub('{|',input)
        input = regex.regtableConvertRevertEnd.sub('|}',input)
        return input
        
    def tableCleaner(self,input):
        while True:
        	if regex.regtableClean.search(input):
        		input = regex.regtableClean.sub(r'<EOS />',input)
        	else:
        		break
        return input
        
        
    def tableCleaner2(self,input):
        splitInput = input.split("\n")
        keepInput = []
        inTable = False
        for inputLine in splitInput:
            if inputLine.startswith("|") or inputLine.startswith("!") :
                if inTable == True:
                    continue
                    regtablestart
            if regex.regtablestart.search(inputLine):
                inTable = True
                continue
            if inputLine.startswith("|}") and inTable == True:
                inTable = False
                continue
            if inTable == True:
                if not inputLine.startswith("|"):
                    if not inputLine.startswith("!"):
                        inTable = False
            keepInput.append(inputLine)
        return "\n".join(keepInput)
        
        
    def tableCleaner2Reverse(self,input):
            splitInput = input.split("\n")
            splitInput.reverse()
            keepInput = []
            inTable = False
            for inputLine in splitInput:
                if inputLine.startswith("|") or inputLine.startswith("!") :
                    if inTable == True:
                        continue
                if inputLine.startswith("|}"):
                    inTable = True
                    continue
                if inputLine.startswith("{|") and inTable == True:
                    inTable = False
                    continue
                if inTable == True:
                    if not inputLine.startswith("|"):
                        if not inputLine.startswith("!"):
                            inTable = False
                keepInput.append(inputLine)
            keepInput.reverse()
            return "\n".join(keepInput)
            
        
    def processFile(self,firstLine,wholeFile):
            if self.redirect == 1:
                try:
                    firstLine, wholeFile = self.redirectCheck(firstLine,wholeFile)
                except:
                    sys.stdout.write("\nERROR WITH THE REDIRECT PROCESSING\nARE YOU SURE AN OFFLINE WIKIPEDIA READER IS ENABLED?\n\nTo run the script without redirect processing, use the -n option\n")
                    #sys.exit(0)
            if self.dictChecker.checkIfIn(firstLine):
                firstLine,wholeFile = self.cleanArticle(firstLine,wholeFile)
                firstLine = self.addTitle(firstLine)
                return firstLine,wholeFile
            else:
                return False, False


    def cleanArticle(self,firstLine,wholeFile):
        firstline = self.regCleanFile(firstLine)
        wholeFile = self.regCleanFile(wholeFile)
        wholeFile = self.removeEnd(wholeFile)
        return firstLine,wholeFile

    def addTitle(self,firstLine):
        firstLine = '<ARTICLE><EOS /><article>'+firstLine.rstrip()+'<NEOS /></article>'
        return firstLine


    def redirectCheck(self,firstLine,wholeFile):    # if the article contains a redirect link, the redirect URL will be used to 
        if regex.regredirect.search(wholeFile):    #retrive the correct article
            title = regex.regredirect.sub(r'\1',wholeFile)
            url = 'http://127.0.0.1:8000/article/'+title    #address must correspond with local Wikipedia
            url = re.sub(r' ','_',url)
            #print url
            page = urllib.urlopen(url)
            page.close()
            thisfile = '/var/tmp/result'    #the wiki article is stored here when it is accessed 
            file = open(thisfile,'r')
            firstLine = file.readline()
            wholeFile = file.read()
            return firstLine,wholeFile
        else:
            return firstLine,wholeFile

        
    def addNewline(self,input):
        input = regex.regeos.sub('\n',input) 
        return input
    
    def removeEnd(self,input):
        input = regex.regsourcelookahead.sub(r'___',input)
        input = regex.regsourcelookahead.sub(r'___',input)
        input = regex.regsourcelookahead.sub(r'___',input)
        
        input = regex.regseealsolookahead.sub(r'___',input)
        input = regex.regseealsolookahead.sub(r'___',input)
        input = regex.regseealsolookahead.sub(r'___',input)
        
        input = regex.regnoteslookahead.sub(r'___',input)
        input = regex.regnoteslookahead.sub(r'___',input)
        input = regex.regnoteslookahead.sub(r'___',input)
        
        input = regex.regreflookahead.sub(r'___',input)
        input = regex.regreflookahead.sub(r'___',input)
        input = regex.regreflookahead.sub(r'___',input)
        
        input = regex.regsealso.sub('',input)   
        input = regex.regnotes.sub('',input)   
        input = regex.regreferences.sub('',input)
        
        
        input = regex.regsources.sub('',input)
        
        input = regex.regsourcelookaheadrestore.sub(r'\2\1\2',input)

        input = regex.regseealsolookaheadrestore.sub(r'\2\1\2',input)
        input = regex.regnoteslookaheadrestore.sub(r'\2\1\2',input)
        input = regex.regreflookaheadrestore.sub(r'\2\1\2',input)
        
        input = regex.regbibliography.sub('',input) 
        input = regex.regfootnotes.sub('',input) 
        input = regex.regrelated.sub('',input) 
        input = regex.regexternal.sub('',input) 
        
        
        return input

    def regCleanFile(self,input):

    
        input = regex.regipa.sub(r'<___\1___>',input)
        input = regex.regjava.sub(r'<___\1___>',input)
        input = regex.regiast.sub(r'<___\1___>',input)
        
        ## japanese article templates
        
        #input = regex.regtransjap.sub(r'\1',input)
        #input = regex.reglangjap.sub(r'\1',input)
        
        
        input = regex.reglanggeneralpreserve.sub(r'<___\1___>',input)
        input = regex.regtransgeneralpreserve.sub(r'<___\1___>',input)
        input = regex.regnihongopreservere.sub(r'<___\1___>',input)
        
        
        input = regex.regharv_general.sub(r'<___\1___>',input)
        input = regex.regaudio_general.sub(r'<___\1___>',input)
        input = regex.regflagtemplate.sub(r'<___\1___>',input)
        
        
        """
        IF WE WANT TO EXPAND THE TEMPLATES, USE THESE!
        input = regex.regharv_aut_aut_year_page.sub(r'(\1 & \2 \3, \4)',input)
        input = regex.regharv_aut_year_page.sub(r'(\1 2, \3)',input)
        input = regex.regharvtxt_aut_year_page.sub(r'\1 (\2, \3)',input)
        input = regex.regharvtxt_aut_year.sub(r'\1 (\2)',input)
        input = regex.regharvtxt_aut_aut_year_page.sub(r'\1 & \2 (\3, \4)',input)
        input = regex.regharvtxt_aut_aut_year.sub(r'\1 & \2 (\3)',input)
        input = regex.regharvnb_aut_year_page_nb.sub(r'\1 \2, \3',input)
        input = regex.regharvnb_aut_year_nb.sub(r'\1 \2',input)
        input = regex.regharvnb_aut_aut_year_page_nb.sub(r'\1 & \2 \3, \4',input)
        #input = regex.regharvnb_aut_aut_year_page_nb.sub(r'\1 & \2 \3, \4',input)
        input = regex.regharvnb_aut_aut_aut_year_page_nb.sub(r'\1, \2 & \3 \4, \5',input)
        input = regex.regharvnb_aut_aut_year_nb.sub(r'\1 & \2 \3',input)
        input = regex.regharvnb_aut_aut_aut_year_nb.sub(r'\1, \2 & \3 \4',input)
        input = regex.regharvcoltxt_aut_year_page.sub(r'\1 (\2:\3)',input)
        
       	"""
       
       
        """
        TO EXPAND TEMPLATES, USE THESE!
        
        
        #input = regex.regtransgeneral.sub(r'\1',input)
        #input = regex.reglanggeneral.sub(r'\1',input)
        
        input = regex.regnihongohardcode.sub(r'\1)',input)
        input = regex.regnihongojap5.sub(r'\1 (\2 \3 \4 \5)',input)
        input = regex.regnihongojap4.sub(r'\1 (\2 \3 \4)',input)
        input = regex.regnihongojap3.sub(r'\1 (\2 \3)',input)
        input = regex.regnihongojap2.sub(r'\1 (\2)',input)
        """
        
        
        while  regex.regcurly1.search(input):
             input = regex.regcurly1.sub('',input)
        

        input = regex.regblockquote.sub(r'<EOS /><blockquote> <NEOS /> \1 <NEOS /> </blockquote><EOS />',input)
        input = regex.regdiv2.sub('',input)
        input = regex.reggallery.sub('',input)
        input = regex.regimage.sub(r'',input)
        input = regex.regimage.sub(r'',input)
        

        input = regex.regref2.sub('',input)
        input = regex.regref.sub('',input)   
        input = regex.regcomment.sub('',input)
        
        input = regex.regsingleast.sub('',input)
        input = regex.regdeflist.sub(r'\1<EOS />',input)
        

        input = self.tableCleaner(input)
        input = self.tableCleaner2(input)
        input = self.tableCleaner2Reverse(input)

        
        input = regex.regwikitable3.sub('<EOS />',input)

        input = regex.regtable.sub('<EOS />',input)

        input = regex.regtableborder.sub('<EOS />',input)
        
        
        input = regex.regtablehardcode.sub('<EOS />',input)
        
        
        input = regex.regcategory.sub('<EOS />',input)
        input = langregex.purgeLang(input)
        
        
        input = regex.regbacktocurly1.sub('{{',input)
        input = regex.regbacktocurly2.sub('}}',input)
        
        input = regex.regsentinitialbracket.sub(r'\1<EOS />\2',input)
        input = regex.regbracket.sub(r'\1<EOS />', input)
        input = regex.regbullets.sub(r'<EOS />\1<EOS />', input)
        input = regex.regbullets2.sub(r'<EOS />\1<EOS />', input)
        input = regex.regindentcolon.sub(r'<EOS />\1<EOS />', input)
        input = regex.regbulletscolon.sub(r'<EOS />\2<EOS />', input)
        input = regex.regbr.sub(r'<EOS />',input)
        input = regex.regtitle.sub(r'<EOS />\1<EOS />',input)
        input = regex.regparagraph.sub('<EOS />',input)
        input = regex.regyeareos.sub(r'\1<EOS />',input)
        input = regex.regorg.sub(r'\1<EOS />',input)
        input = regex.reghyphen.sub('',input)
        
        #remove no wiki
        #input = regex.regremovenowiki.sub('',input)
        
        
        input = self.removeLines(input)
        
        
        input = regex.regremovenewline.sub(' ',input)
        return input

#    def removeTableLeftover(self,input):
#            splitInput = input.split('\n')
#            newArticle = ''
#            for line in splitInput:
#                if not regex.regletternumber.search(line):
#                    continue
#                if not "ARTICLE>" in line:
#                    if regex.regonlyXML.match(line):
#                        continue
#                if line.startswith("|") or line.startswith("!") or line.startswith("{|") or line.startswith("|}"):
#                   continue
#                else: 
#                    newArticle = newArticle+str(line)+'\n'
#            return newArticle.rstrip()
#                

    def removeLines(self,line):
        splitLine = line.split("\n")
        keepLine = []
        for line in splitLine:
            if regex.regletternumber.search(line):
                keepLine.append(line)
        return "\n".join(keepLine)

class WikiDict:
    def __init__(self,wikiDict=None):
        self.wikiDict = {}

    def checkIfIn(self,firstLine):
        if firstLine in self.wikiDict:
            return False
        else:
            self.wikiDict[firstLine] = ""
            return True
        
class WikiWriter:
    def __init__(self,output):
        self.outFile = open(output,'w')
    
    def writeFile(self,firstLine,wholeFile):
        if wholeFile:
            #print "Writing "+ firstLine
            self.outFile.write('<EOS />'+firstLine+'<EOS />')
            self.outFile.write(wholeFile)
            self.outFile.write('\n</ARTICLE>\n')

    def closeFile(self):
        self.outFile.close()
    

def checkSyntax(input,output):
     if output == None or input == None:
        return False
     if os.path.isdir(input) and not os.path.isdir(output):
         return True 
     else:
         return False


def main():
    parser = OptionParser()
    parser.add_option("-i", "--input", dest="input",
                  help="Input folder where Wikipedia Source files are stored", metavar="Input Folder")
    parser.add_option("-o", "--output", dest="output",
                  help="Output file where cleansed Wikipedia Source files will be stored", metavar="Output File")
    
    parser.add_option("-n", "--noredirects", dest="redirects", action ="store_false",
                      help="No connection to local Wikipedia Reader, therefore no redirect processing.", 
                      metavar="redirects")
    options, args = parser.parse_args()
    input = options.input
    output = options.output
    redirect = options.redirects
    if checkSyntax(input,output):
        pass
    else:
        sys.stdout.write("\nERROR WITH THE INPUT/OUTPUT FILES\n")
        sys.exit(0)
    wikiReader = WikiReader() 
    wikiProcess = WikiProcessor(redirect)
    wikiWriter = WikiWriter(output)
    adjustName = AdjustName()
    input = adjustName.removeSlash(input)
    fileList = wikiReader.listFiles(input)
    for file in fileList:
        fileName = input+'/'+file
        if os.path.isfile(fileName):
        	
        	#test utf-8
        	#codecs.open( "someFile", "r", "utf-8" )
            #fileObject = open(input+'/'+file,'r','utf-8')


            fileObject = open(input+'/'+file,'r')
            
            
            firstLine = wikiReader.readFirstLine(fileObject)
            wholeFile = wikiReader.readFile(fileObject)
            #print fileName
            #print len(wholeFile)
            firstLine, wholeFile = wikiProcess.processFile(firstLine,wholeFile)
            if firstLine:                #returns false if article already in
                wikiWriter.writeFile(firstLine,wholeFile)
            fileObject.close()
    wikiWriter.closeFile()
    
    
if __name__ == '__main__':
    main()