# User:Inductiveload/Scripts/Page concatenator
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# untitled.py
#
# Copyright 2010 Inductiveload
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
import pw_script_header
import wikipedia
import codecs
import re
# Text file listing the wiki pages to concatenate; main() opens it as UTF-8
# and treats each line (with surrounding whitespace stripped) as a page title.
FILELIST = "/home/john/src/pw/zz_gendata.txt"
# Title of the wiki page that main() uploads the concatenated text to.
DESTINATION = 'User:Inductiveload/Sandbox7'
# Local file that main() writes the concatenated text to (as UTF-8) before
# asking whether to upload.
OUTFILE = '/tmp/concatfile.txt'
def find_matching_braces(text, first_brace_index):
    """Find the end of the bracketed span opened at *first_brace_index*.

    The character at ``text[first_brace_index]`` must be one of ``{``, ``[``,
    ``(`` or ``<``.  Nesting is tracked for that bracket type only; other
    bracket characters inside the span are treated as ordinary text.

    Returns the index one past the closing bracket that balances the opening
    one, so ``text[first_brace_index:result]`` is the complete bracketed span
    (for ``"{{a}}b"`` and index 0 the result is 5).  Returns ``None``, after
    printing an error, when the character is not a supported opening bracket
    or when the brackets never balance before the end of *text*.
    """
    closing_for = {'{': '}', '[': ']', '(': ')', '<': '>'}

    lbrace = text[first_brace_index]
    rbrace = closing_for.get(lbrace)
    if rbrace is None:
        print("(ERR) not a brace")
        return None

    # Nesting depth of `lbrace`; the span ends when this drops back to zero.
    depth = 0
    for index, char in enumerate(text[first_brace_index:], first_brace_index):
        if char == lbrace:
            depth += 1
        elif char == rbrace:
            depth -= 1
            if depth == 0:
                # Stop at the balancing bracket itself so that an adjacent
                # bracketed span (e.g. "{{a}}{{b}}") is not swallowed.
                return index + 1

    print("(ERR) unbalanced brackets")
    return None
def find_header(wikitext):
    """Locate the first ``{{header`` template in *wikitext*.

    The opening is matched with the regex ``{{\\s*header`` and the end is
    found by balancing braces with find_matching_braces().

    Returns a ``(header_start, header_end)`` tuple where *header_start* is the
    index of the first ``{`` and *header_end* is the value returned by
    find_matching_braces(), so ``wikitext[header_end:]`` is the text after
    the header.  Returns ``(0, 0)``, after printing an error, when no header
    is present or when its braces cannot be balanced.
    """
    match = re.search(r'({{\s*header)', wikitext)
    if not match:
        print("(ERR) Header not found.")
        return 0, 0

    header_start = match.start(1)
    print("(INF) Header found, starting at %d" % header_start)

    header_end = find_matching_braces(wikitext, header_start)
    if header_end is None:
        # find_matching_braces() signals failure with None; formatting that
        # with %d would raise TypeError, so report it like a missing header.
        print("(ERR) Header is not terminated.")
        return 0, 0

    print("(INF) header ends at char: %d" % header_end)
    return header_start, header_end
def main():
    """Concatenate the wiki pages listed in FILELIST and optionally upload them.

    For every non-blank line of FILELIST the named page is fetched, everything
    up to the end of its header (as reported by find_header()) is dropped, and
    all ``===`` and ``==`` sequences are removed.  The joined result is written
    to ``/tmp/wikt.txt`` and to OUTFILE, and is uploaded to the DESTINATION
    page only if the user answers exactly ``y`` at the prompt.

    Returns 0.
    """
    site = wikipedia.getSite()

    pieces = []
    # `with` guarantees the list file is closed even if a page fetch fails.
    with codecs.open(FILELIST, 'r', 'utf-8') as filelist:
        for line in filelist:
            title = line.strip()
            if not title:
                # A blank line would otherwise request a page with no title.
                continue
            print("(INF) processing page: %s" % title)
            page = wikipedia.Page(site, title)
            newtext = page.get()
            _, end = find_header(newtext)
            newtext = newtext[end:]
            # Order matters: strip "===" first so no stray "=" is left behind.
            newtext = newtext.replace("===", "").replace("==", "")
            pieces.append(newtext)
    text = ''.join(pieces)

    # NOTE(review): the listing this was recovered from had lost its
    # indentation; the scratch copy is assumed to be written once, after
    # the loop -- confirm.
    with codecs.open('/tmp/wikt.txt', 'w', 'utf-8') as scratch:
        scratch.write(text)

    print("(INF) Saving text to %s" % OUTFILE)
    with codecs.open(OUTFILE, 'w', 'utf-8') as outfile:
        outfile.write(text)

    print("(INF) Uploading concatenated pages to %s." % DESTINATION)
    cont = raw_input("continue? [y/n]")
    if cont == 'y':
        page = wikipedia.Page(site, DESTINATION)
        page.put(text, 'Uploading concatenated pages to %s.' % DESTINATION,
                 minorEdit=False)
    return 0
# Run the concatenator only when this file is executed as a script.
if __name__ == '__main__':
    main()