# Wiki page: User:Mpaa/Sandbox3 ("Appearance" is sidebar residue from the page export)
# -*- coding: utf-8 -*-
import query
import wikipedia as pywikibot
import i18n
import time
import datetime
import sys
import re
import query_ext
class PreloadAuthorPages:
    """Collect the members of "Category:Authors-X" and of its subcategories.

    The result of get() is a nested dictionary:

        {u'Category:Authors-X': {subcategory title: {key: value}}}

    With getinfo false the innermost dictionary maps each member title to
    itself; with getinfo true it maps a sort key to the Author info
    dictionary produced by GetAuthorInfo.get().
    """

    def __init__(self, letter=u'Z', getinfo=True):
        """
        Constructor. Parameters:
        @param letter: The initial of Author pages to get
        @param getinfo: get also Author info or only page list?
        """
        self.letter = letter
        self.title = u'Category:Authors-' + self.letter.upper()
        self.getinfo = getinfo

    def get(self):
        """
        Walk "Category:Authors-X" and every subcategory found in it.

        @return: nested dictionary described in the class docstring. The
                 top-level key is always present, so a letter category
                 without subcategories yields {title: {}}.
        """
        # Register the top-level entry up front: previously it was only
        # created when a subcategory was found, so an empty letter category
        # raised KeyError in the getinfo pass below.
        subcats = {}
        letterDict = {self.title: subcats}

        # NOTE(review): 500 is handed straight to the query_ext helper; it
        # looks like a per-request batch size -- confirm against query_ext.
        for item in query_ext._PreloadingCategory(self.title, 500):
            # pywikibot.output instead of a bare print: it is what the rest
            # of this file uses for progress messages.
            pywikibot.output(item[u'title'])
            # Only members in namespace 14 (subcategories, i.e.
            # "Category:Authors-X...") are followed; other direct members
            # of the letter category are just reported.
            if item[u'ns'] != 14:
                continue
            cat = item[u'title']
            members = {}
            for subitem in query_ext._PreloadingCategory(cat, 500):
                pywikibot.output(u'%s SubItem: %s' % (cat, subitem[u'title']))
                members[subitem[u'title']] = subitem[u'title']
            subcats[cat] = members

        if self.getinfo:
            # Replace each plain title listing with the Author info
            # dictionaries of the pages it names.
            for cat in list(subcats):
                subcats[cat] = self._load_author_info(subcats[cat])
        return letterDict

    def _load_author_info(self, titles):
        """
        Fetch Author info for every title and index it by sort key.

        @param titles: iterable of Author page titles
        @return: dictionary mapping a unique sort key to the info dictionary
                 (extended with an u'author' entry holding the page title).
                 Pages for which GetAuthorInfo.get() returns None are left out.
        """
        authors = {}
        # Sorted so that, when two pages share a sort key, which one keeps
        # the bare key does not depend on dictionary ordering.
        for author in sorted(titles):
            info = GetAuthorInfo(author).get()
            if info is None:
                continue
            info[u'author'] = author
            authors[self._unique_key(authors, info[u'sortkey'], author)] = info
        return authors

    @staticmethod
    def _unique_key(taken, sortkey, author):
        """
        Return a key for *author* that does not clash with keys in *taken*.

        The first page with a given sort key keeps it unchanged. A later
        page with the same sort key gets the page title appended after a
        newline, so it is no longer silently overwritten; the newline sorts
        below any printable character, which keeps the entry directly after
        its namesake.
        """
        if sortkey not in taken:
            return sortkey
        return u'%s\n%s' % (sortkey, author)
class GetAuthorInfo:
    """Read selected {{author}} template fields from one Author page."""

    # Patterns used by get_param_value(), compiled once for all instances.
    _COMMENT_RE = re.compile(r'<!--.*?-->')
    _TEMPLATE_RE = re.compile(r'{{.*?({{.*)?}}')
    _REF_RE = re.compile(r'<ref>.*?</ref>')

    def __init__(self, author, paramAuthors=None):
        """
        Constructor. Parameters:
        @param author: Author page title
        @param paramAuthors: Author template fields to extract; when None
                             the default field list below is used
        """
        self.author = author
        if paramAuthors is None:
            self.paramAuthors = [u'lastname', u'firstname', u'birthyear',
                                 u'deathyear', u'last_initial', u'description',
                                 u'defaultsort']
        else:
            self.paramAuthors = paramAuthors
        # Debug trace of the requested and the effective field list.
        pywikibot.output(u'%s %s %s'
                         % (self.author, paramAuthors, self.paramAuthors))

    def get(self):
        """
        Get Author info and set the sort key.

        @return: dictionary with one entry per field in self.paramAuthors
                 plus u'sortkey', or None for subpages, redirects and
                 not existing pages
        """
        page = pywikibot.Page(pywikibot.getSite(), self.author)
        if '/' in self.author:
            pywikibot.output(u'Warning - page %s is a subpage; skipping.'
                             % page.title(asLink=True))
            return None

        pywikibot.output(u'Getting page: %s' % page.title(asLink=True))
        pagetext = self.load(page)
        if pagetext is None:
            return None

        # A missing template is only reported; the fields are still looked
        # up and simply come back empty.
        if '{{author' not in pagetext.lower():
            pywikibot.output(u'Warning - no template: ' + self.author)

        author_dict = {}
        for param in self.paramAuthors:
            author_dict[param] = self.get_param_value(param, pagetext)
        author_dict[u'sortkey'] = self.set_sortkey(author_dict)
        return author_dict

    def load(self, page):
        """
        Load the given page and return its text.

        @param page: page object to read
        @return: the page text, or None when the page does not exist or is
                 a redirect (a message is output in both cases)
        """
        try:
            return page.get(throttle=False)
        except pywikibot.NoPage:
            pywikibot.output(u'Page %s does not exist; skipping.'
                             % page.title(asLink=True))
        except pywikibot.IsRedirectPage:
            pywikibot.output(u'Page %s is a redirect; skipping.'
                             % page.title(asLink=True))
        return None

    def get_param_value(self, param, wikitext):
        """
        Get the value of a template parameter from the author page text.

        The value is whatever follows "|param =" up to the end of that line,
        with HTML comments, templates and <ref>...</ref> footnotes removed.

        @param param: template parameter name
        @param wikitext: page text
        @return: the cleaned value, or u'' when the parameter is not found
        """
        if not param:
            return u''
        # Anchor on the "|name =" syntax of a named template parameter, so
        # that the same word appearing elsewhere on the page (prose, another
        # parameter's name) is not mistaken for the parameter itself.
        found = re.search(r'\|\s*' + re.escape(param) + r'\s*=([^\n]*)',
                          wikitext)
        if found is None:
            return u''
        output = found.group(1)
        # Strip every complete comment (there are cases with multiple
        # comments), then cut at a comment left open on this line.
        output = self._COMMENT_RE.sub(u'', output)
        output = output.split('<!--')[0].strip()
        # Remove templates from the value.
        output = self._TEMPLATE_RE.sub(u'', output).strip()
        output = self._REF_RE.sub(u'', output).strip()
        return output

    def set_sortkey(self, author_dict):
        """
        Compute the sort key for an author.

        @param author_dict: field dictionary; missing fields count as empty
        @return: u'defaultsort' when set, else "lastname, firstname", else
                 whichever of the two names is not empty (u'' if neither)
        """
        defaultsort = author_dict.get(u'defaultsort', u'')
        if defaultsort:
            return defaultsort
        last = author_dict.get(u'lastname', u'')
        first = author_dict.get(u'firstname', u'')
        if last and first:
            return last + u', ' + first
        # Only one name (or none): no dangling ", " separator.
        return last or first
class PutAuthors:
    """Render an author dictionary as index-page wikitext and write it out."""

    def __init__(self, letterDict, letter, filename='debug_authors_dict.txt', getinfo=True):
        """
        Constructor. Parameters:
        @param letterDict: Author dictionary, indexed by category title, then
                           subcategory title, then sort key
        @param letter: which letter to do
        @param filename: output file
        @param getinfo: also Author info? When false the innermost values of
                        letterDict are taken to be plain page titles.
        """
        self.letterDict = letterDict
        self.letter = letter
        self.getinfo = getinfo
        self.title = u'Category:Authors-' + letter.upper()
        self.filename = filename
        self.template = u'{{author index page|' + letter + u'}}'

    def _display_title(self, title):
        """Return *title* without its leading "Namespace:" prefix, if any."""
        return title.split(u':', 1)[-1]

    def formatAuthor(self, author):
        """
        Format one author as a list line for the output file.

        @param author: Author info dictionary; u'author' (the page title) is
                       required, the other fields default to empty
        @return: "*[[title|name]], (birth - death), description" with the
                 date and description parts present only when known
        """
        authorlink = author[u'author']
        last = author.get(u'lastname', u'')
        first = author.get(u'firstname', u'')
        birth = author.get(u'birthyear', u'')
        death = author.get(u'deathyear', u'')
        description = author.get(u'description', u'')

        if last and first:
            namestring = u'%s, %s' % (last, first)
        else:
            # Occasionally only one of the names is specified. With neither,
            # fall back to the page title so the line can still be built
            # (this case used to hit an unbound local variable).
            namestring = last or first or self._display_title(authorlink)

        datestring = u''
        if birth or death:
            ndash = u'\u2013'
            datestring = u', (%s %s %s)' % (birth, ndash, death)
        if description:
            description = u', ' + description
        return u'*[[%s|%s]]%s%s' % (authorlink, namestring, datestring, description)

    def generateText(self):
        """
        Generate the page text.

        @return: the index template line followed by one "==Xx==" section
                 per subcategory (headed by the last two characters of its
                 title), each listing its authors in sort-key order
        """
        # A dictionary without an entry for this letter gives a page with
        # only the template line instead of a KeyError.
        subcats = self.letterDict.get(self.title, {})
        page_output = [self.template + u'\n']
        for cat in sorted(subcats):
            page_output.append(u'==%s==' % cat[-2:])
            entries = subcats[cat]
            for key in sorted(entries):
                entry = entries[key]
                if self.getinfo:
                    page_output.append(self.formatAuthor(entry))
                else:
                    # Without Author info the entry is just a page title.
                    page_output.append(u'*[[%s|%s]]'
                                       % (entry, self._display_title(entry)))
            page_output.append(u'\n')
        return u'\n'.join(page_output)

    def writeFile(self, page_text):
        """
        Write *page_text* to self.filename, UTF-8 encoded.

        The file is opened in binary mode because encoded bytes are written,
        and is closed even if the write fails.
        """
        with open(self.filename, 'wb') as f:
            f.write(page_text.encode('utf-8'))

    def run(self):
        """Generate the page text and write it to the output file."""
        page_text = self.generateText()
        self.writeFile(page_text)
# Script configuration: the initial to process, whether Author info is
# fetched for every page, and where the generated page text is written.
letter = u'z'
# NOTE: this list is not forwarded below (PreloadAuthorPages takes no such
# argument); GetAuthorInfo falls back to its own default field list.
paramAuthors = [u'lastname', u'firstname', u'birthyear', u'deathyear',
                u'last_initial', u'description', u'defaultsort']
getinfo = True
filename = 'debug_authors_dict.txt'

# Pass the settings on explicitly so that editing them above takes effect
# (they used to be defined but never used).
authorDictionary = PreloadAuthorPages(letter, getinfo=getinfo)
letterDict = authorDictionary.get()
out2file = PutAuthors(letterDict, letter, filename=filename, getinfo=getinfo)
x = out2file.run()