import re,sys, os, cPickle
# Captures the text between a <title> opening tag and its closing tag.
# NOTE(review): the opening tag was missing and the literal was split over two
# lines (a syntax error); it is restored here from the closing tag the pattern
# already contained -- confirm against the dump format being parsed.
titleReg = re.compile(r'<title>([^<]+)<\/title>', re.MULTILINE | re.DOTALL)
#articleReg = re.compile(r'(.*?)', re.MULTILINE | re.DOTALL)
# NOTE(review): as written this pattern has an empty second alternative and a
# lazy group, so it matches the empty string at every position. It looks like
# the tags that should surround the group were lost -- confirm the intended
# markup before relying on it.
articleReg = re.compile(r'(.*?)|', re.MULTILINE | re.DOTALL)
def replaceEntities(input):
    """Decode the five predefined XML character entities in ``input``.

    Replaces ``&lt;``, ``&gt;``, ``&apos;``, ``&quot;`` and ``&amp;`` with the
    characters they stand for and returns the resulting string.

    NOTE(review): the entity names are restored from the replacement
    characters; ``&apos;`` is assumed for the apostrophe -- confirm the dump
    does not use ``&#39;`` instead.
    """
    input = input.replace('&lt;', '<')
    input = input.replace('&gt;', '>')
    input = input.replace('&apos;', "'")
    input = input.replace('&quot;', '"')
    # '&amp;' must be decoded last: doing it earlier would turn an escaped
    # entity such as '&amp;lt;' into '&lt;' and then wrongly into '<'.
    input = input.replace('&amp;', '&')
    return input
def removetitledash(input):
    """Return ``input`` with every '/' swapped for '_'."""
    # Splitting on the slash and re-joining with an underscore substitutes
    # each occurrence, including leading, trailing and repeated slashes.
    pieces = input.split('/')
    return '_'.join(pieces)
def leniter(iterator):
    """Count the items in ``iterator``.

    Objects that expose ``__len__`` are measured with ``len()``. Anything
    else is counted by iterating it to exhaustion, which consumes it.
    """
    if not hasattr(iterator, "__len__"):
        # No cheap length available: walk the iterator and tally each item.
        return sum(1 for _ in iterator)
    return len(iterator)
def folderIncrement(i):
    """Return ``str(i)`` left-padded with '0' to at least three characters.

    Values whose string form is already three or more characters long are
    returned unchanged.
    """
    label = str(i)
    shortfall = 3 - len(label)
    if shortfall > 0:
        # Prepend exactly as many zeros as are needed to reach width three.
        label = '0' * shortfall + label
    return label
def main():
    """Split a dump file into consecutive chunk files under ``filedumps/``.

    The dump path is taken from the last command-line argument. A list of
    break points is unpickled from ``breakPoints.py`` in the current
    directory; each break point is treated as a cumulative offset, so the
    amount read for a chunk is the difference from the previous break point.
    Chunks are written to ``filedumps/001.txt``, ``002.txt``, ... and whatever
    remains after the last break point goes into one further file.

    If no argument is given, a usage message is printed and the process
    exits with status 0.
    """
    if len(sys.argv) < 2:
        print("Provide wikidump name")
        sys.exit(0)

    # Every handle is opened in a ``with`` block so it is closed even when a
    # read, write or unpickle raises part-way through.
    with open(sys.argv[-1], 'r') as f:
        # NOTE(review): unpickling can execute arbitrary code; only run this
        # against a breakPoints.py produced by a trusted tool.
        with open('breakPoints.py', 'r') as breakPointFile:
            breakPoints = cPickle.load(breakPointFile)

        folderName = 'filedumps/'
        if not os.path.exists(folderName):
            os.mkdir(folderName)

        currentBreak = 0
        breakNum = 0
        for breakPoint in breakPoints:
            # Progress is reported 0-based, before the counter is advanced.
            print("Break number %s of %s " % (breakNum, len(breakPoints)))
            breakNum = breakNum + 1
            # Break points are cumulative, so read only the distance from
            # the previous one.
            actualBreak = breakPoint - currentBreak
            dump = f.read(actualBreak)
            splitPath = os.path.join(
                folderName, folderIncrement(breakNum) + ".txt")
            with open(splitPath, 'w') as newSplit:
                newSplit.write(dump)
            currentBreak = breakPoint

        # Whatever follows the final break point becomes one last chunk.
        dump = f.read()
        splitPath = os.path.join(
            folderName, folderIncrement(breakNum + 1) + ".txt")
        with open(splitPath, 'w') as newSplit:
            newSplit.write(dump)
# Run the splitter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()