import re,sys, os, cPickle, codecs, string
"""
to find breakpoints, use:
grep -b "" enwiki-dump > pagebreaks.txt
then:
cat pagebreaks.txt | python findBreakPoints.py
then:
python wikidumpArticleparser2.py enwiki-dump
"""
# Patterns that pull page titles and article bodies out of a MediaWiki XML dump.
#
# NOTE(review): the tag literals were missing from the previous patterns (the
# title pattern was not even a valid string literal, and the article pattern
# matched the empty string everywhere). They are reconstructed here from the
# MediaWiki export format -- confirm against a real dump before relying on them.
titleReg = re.compile(r'<title>([^<]+)<\/title>', re.MULTILINE | re.DOTALL)
# Group 1 is the article text. The first alternative matches the self-closing
# <text ... /> element used for empty pages and leaves group 1 as None, so
# every page still yields exactly one match. Attributes on <text> are allowed
# because their set differs between dump versions.
articleReg = re.compile(r'<text\b[^>]*?\/>|<text\b[^>]*>(.*?)<\/text>', re.MULTILINE | re.DOTALL)
def replaceEntities(input):
    """Return *input* with the five predefined XML entities decoded.

    Decodes &lt; &gt; &apos; &quot; and &amp; to the characters they stand
    for. Text without entities is returned unchanged.

    NOTE(review): the entity names were missing from the previous version
    (every replacement mapped a character to itself); the standard XML set
    is assumed here.
    """
    replacements = (
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&apos;', "'"),
        ('&quot;', '"'),
        # '&amp;' has to come last: decoding it first would turn '&amp;lt;'
        # into '&lt;' and then wrongly into '<'.
        ('&amp;', '&'),
    )
    for entity, char in replacements:
        input = input.replace(entity, char)
    return input
def removetitledash(input):
    """Return *input* with every '/' replaced by '_'.

    Despite the name, the character replaced is the forward slash.
    """
    return input.replace('/', '_')
def folderIncrement(int):
    """Return str(int) left-padded with '0' to at least 8 characters.

    Values whose string form is already 8 characters or longer are returned
    unpadded. The padding is purely textual: a minus sign is treated like
    any other character (-5 -> '000000-5').
    """
    # rjust does the padding in one call; zfill is deliberately not used
    # because it would move a leading sign in front of the zeros.
    return str(int).rjust(8, '0')
# parseWiki: split a wiki dump into one text file per article.
#
#   file          -- open file object for the dump; it is read in full with a
#                    single read() and the result is .encode("utf-8")'d, so it
#                    presumably has to yield unicode text (main() opens it
#                    through codecs.open).
#   folderName    -- path prefix prepended to the file name of kept articles.
#   weScienceDict -- container tested with `in`; an article whose title is
#                    found in it is not written under folderName.
#   moveArticles  -- handed to checkIfMoving() together with the title.
#
# An article is written under folderName only when its content is longer
# than 2000 (measured on the UTF-8 encoded text), its title is not in
# weScienceDict, and checkIfMoving() returns a true value.
#
# NOTE(review): this listing carries no indentation, so the nesting of the
# if/else chain below cannot be read off the text -- in particular which
# `if` the `else` before the throwOutFolder write belongs to, and at which
# level `i = i + 1` runs. Restore the indentation from version control
# before changing any logic here.
def parseWiki(file,folderName,weScienceDict,moveArticles):
f = file
# The whole dump is held in memory at once.
dump = f.read()
# Two independent scans over the dump; the text is encoded once per scan.
titleIter = re.finditer(titleReg,dump.encode("utf-8"))
articleIter = re.finditer(articleReg,dump.encode("utf-8"))
# NOTE(review): the counter starts at 1 and is printed as-is in the final
# message, so that figure is one higher than the number of increments.
i = 1
throwOutFolder = 'throwout/'
while True:
try:
# The n-th title match is paired with the n-th article match purely by
# position; .next() is the Python 2 iterator method and raises
# StopIteration when either scan is exhausted.
articleName = titleIter.next().group(1)
articleContent = articleIter.next().group(1)
# group(1) can be None/empty; normalise that to '' so len() below works.
if articleContent:
articleContent = replaceEntities(articleContent)
else:
articleContent = ''
if articleName:
# '/' is replaced before the title is used as a file name.
articleName = removetitledash(articleName)
articleName = replaceEntities(articleName)
else:
articleName = ''
if len(articleContent) > 2000:
if not articleName in weScienceDict:
if checkIfMoving(articleName,moveArticles):
# File name is the first 100 characters of the title, so long titles
# sharing a 100-character prefix end up in the same file.
newArticle = open(folderName+articleName[:100],'w')
newArticle.write(articleName+'\n')
newArticle.write(articleContent)
newArticle.write('\n')
newArticle.close()
# Progress message whenever the counter is a multiple of 5000.
if (i%5000 == 0):
print("Finished parsing %s articles" % i)
i = i + 1
# NOTE(review): which condition this `else` negates is not recoverable
# from the text as it stands (see the note above the function).
else:
newArticle = open(throwOutFolder+articleName[:100],'w')
newArticle.write(articleName+'\n')
newArticle.write(articleContent)
newArticle.write('\n')
newArticle.close()
# Either iterator running dry ends the loop.
except StopIteration:
print "FINISHED, EXTRACTED %s ARTICLES" % i
break
def checkIfMoving(artName,moveList):
    """Return False if *artName* starts with any prefix in *moveList*, else True.

    Note the polarity: a True result means the name matched none of the
    prefixes. An empty *moveList* therefore yields True.
    """
    # any() stops at the first matching prefix, like the explicit loop did.
    return not any(artName.startswith(prefix) for prefix in moveList)
def main():
    """Command-line entry point: split the dump named by the last argument.

    Unpickles the object stored in 'wescienceArticles.py' (relative to the
    current directory), opens the dump as UTF-8 text and passes both to
    parseWiki() with 'output/' as the destination prefix.

    Exits with the usage message on stderr and status 1 when no dump name
    is given on the command line.
    """
    if len(sys.argv) < 2:
        # sys.exit(<str>) prints the message to stderr and exits with
        # status 1, so a missing argument is reported as a failure.
        sys.exit("Provide wikidump name")
    # NOTE(review): unpickling can execute arbitrary code -- only load a
    # file you produced yourself. The open mode is left as 'r' to keep
    # reading pickles exactly as before.
    with open('wescienceArticles.py','r') as weDict:
        weScienceDict = cPickle.load(weDict)
    moveArticles = ['MediaWiki:','Category:', 'Help:', 'Image:', 'Portal:', 'Template:', 'Wikipedia:']
    print("Parsing file %s" % str(sys.argv[-1]))
    folderName = 'output/'
    # The context manager closes the dump even if parseWiki() raises.
    with codecs.open(sys.argv[-1],'r', "utf-8") as f:
        parseWiki(f,folderName,weScienceDict,moveArticles)
# Run the parser only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()