"""Split a wiki dump file into numbered chunk files at pickled break offsets.

Usage: pass the dump file name as the last command-line argument.  The list
of break offsets is unpickled from ``breakPoints.py`` in the current working
directory and the chunks are written to ``filedumps/NNN.txt``.
"""
import os
import re
import sys

try:
    import cPickle  # Python 2: C-accelerated pickle implementation
except ImportError:
    # Python 3 folded cPickle into pickle; keep the old name bound so any
    # code referring to ``cPickle`` keeps working.
    import pickle as cPickle

# NOTE(review): this pattern has no opening ``<title>`` anchor, so group 1 is
# whatever run of non-'<' characters precedes ``</title>`` -- confirm that this
# is intended before relying on it.
titleReg = re.compile(r'([^<]+)<\/title>', re.MULTILINE | re.DOTALL)
# Matches either a <text> element (body captured in group 1) or an empty,
# self-closing <text/> element (group 1 is None in that case).
articleReg = re.compile(
    r'<text xml:space="preserve">(.*?)</text>|<text xml:space="preserve"\s?/>',
    re.MULTILINE | re.DOTALL)


def replaceEntities(input):
    """Return *input* with the basic XML character entities decoded.

    Handles ``&lt;``, ``&gt;``, ``&apos;`` / ``&#39;``, ``&quot;`` and
    ``&amp;``.  Every other character is left untouched.
    """
    input = input.replace('&lt;', '<')
    input = input.replace('&gt;', '>')
    # Both the named and the numeric spelling of the apostrophe are accepted.
    input = input.replace('&apos;', "'")
    input = input.replace('&#39;', "'")
    input = input.replace('&quot;', '"')
    # '&amp;' must be decoded last: doing it earlier would turn '&amp;lt;'
    # into '&lt;' and then wrongly into '<' (double unescaping).
    input = input.replace('&amp;', '&')
    return input


def removetitledash(input):
    """Return *input* with every '/' replaced by '_'."""
    return input.replace('/', '_')


def leniter(iterator):
    """leniter(iterator): return the length of an iterator, consuming it.

    Objects that define ``__len__`` are measured with ``len()`` and are not
    consumed; anything else is iterated to exhaustion and counted.
    """
    if hasattr(iterator, "__len__"):
        return len(iterator)
    return sum(1 for _ in iterator)


def folderIncrement(i):
    """Return ``str(i)`` left-padded with '0' to at least three characters.

    Longer values are returned unchanged, e.g. 7 -> '007', 1234 -> '1234'.
    """
    return str(i).rjust(3, '0')


def _loadBreakPoints(path):
    """Unpickle and return the break offsets stored in the file at *path*."""
    # NOTE: unpickling can execute arbitrary code; only load a break-point
    # file that was produced locally by a trusted tool.
    # Binary mode is what pickle expects (and is mandatory on Python 3).
    with open(path, 'rb') as breakPointFile:
        return cPickle.load(breakPointFile)


def _writeChunk(folderName, number, data):
    """Write *data* to ``<folderName>/<zero-padded number>.txt``."""
    # os.path.join avoids a doubled separator when folderName ends with '/'.
    chunkPath = os.path.join(folderName, folderIncrement(number) + ".txt")
    with open(chunkPath, 'wb') as newSplit:
        newSplit.write(data)


def main():
    """Cut the dump named by the last CLI argument into numbered chunk files.

    Chunk ``k`` (1-based) holds the data between break point ``k - 1`` and
    break point ``k``; one extra chunk receives whatever follows the last
    break point.  Exits with status 1 when no dump name is given.
    """
    if len(sys.argv) < 2:
        print("Provide wikidump name")
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)

    breakPoints = _loadBreakPoints('breakPoints.py')

    folderName = 'filedumps/'
    if not os.path.exists(folderName):
        os.mkdir(folderName)

    currentBreak = 0
    breakNum = 0
    # The dump is read and written as raw bytes so the chunks are an exact
    # partition of the input.
    # NOTE(review): presumably the break points are ascending absolute byte
    # offsets into the dump -- verify against the script that produces them.
    with open(sys.argv[-1], 'rb') as f:
        for breakPoint in breakPoints:
            print("Break number %s of %s " % (breakNum, len(breakPoints)))
            breakNum += 1
            # Offsets are absolute, so read only the span since the last one.
            _writeChunk(folderName, breakNum, f.read(breakPoint - currentBreak))
            currentBreak = breakPoint
        # Everything after the final break point goes into one last chunk.
        _writeChunk(folderName, breakNum + 1, f.read())


if __name__ == '__main__':
    main()