#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# viquitext-0.1.py
#
u"""
Author: Anskar
Look for me in catalan wikipedia
Script to find new .djvu in commons and get to wikisource
Global Variables:
Category to find new .djvu in commons
site_comm = pwb.Site("wikimedia", "commons")
cat_comm = u'Category:DjVu files in Catalan'
File to save the last timestamp and get pagegenerators beginning from that
formed in call save_obj(_file, obj)
How to get the wikisource template
print some_book_page.get() and copy manually the content to make ws_tmplt
Format to ws_tmplt
getting commons {{Book}} template and put params as values in ws_template
dict(re_info) construct the key, value pairs to put in ws_template.format(**dict) important **
and set the text to save in "{{%s}}" % ws_tmplt
Catalan example:
ws_template = u'''
:MediaWiki:Proofreadpage_index_template -- Name ws_tmplt it exists as [[MediaWiki:Proofreadpage_index_template]] page
|Títol=''{Title}'' -- Títol is param ws_tmplt, {Title} would be param commons {{Book}} template
|Autor={Author} -- so construct dict in form
|Editor={Publisher} -- data[comm_param] = comm_value
|Lloc={City} -- data["City"] = "Barcelona"
|Any={Date} -- would be "Lloc=Barcelona" in text to save
|Font=[[:commons:{Image}|Commons]] -- As can see, not always common param is the same wikisource param
|Imatge=1
|Pàgines=<pagelist/>
|Sumari=
|Nivell= I'''
comm_data = re.findall(ur".*?[|] ?(\w+) *= ?(.+)\n", comm_page.get()) to get commons data
data = ws_tmplt.format(**dict(comm_data))
ws_page.put("{{%s}}" % data)
!¡!¡!¡!¡!¡!¡!¡!¡!
!!! IMPORTANT ¡¡¡
!¡!¡!¡!¡!¡!¡!¡!¡!
* All pages created should be reviewed
* Commons pages that do not use the {{Book}} template are not processed successfully
* If find {{Information}} template, put in title ws_tmplt param all commons description
* Others kind of data will save page name in bot page and do nothing
* All pwb.input calls are for test mode; set test = False to run without prompts
* Have a limit variable to test low number of pages
* If you find better code to run, please, share it.
!¡!¡!¡!¡!¡!¡!¡!¡!
!!! IMPORTANT ¡¡¡
!¡!¡!¡!¡!¡!¡!¡!¡!
!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!!!!!
!!! MORE GREATEST IMPORTANT ¡¡¡
!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!!!!!
Enjoy it, be fun ;)
"""
import sys, os
import datetime
import traceback
import codecs
import bz2, pickle
import re
from collections import Counter
import wikipedia as pwb
from query import GetData
def backtrace_error(func):
    """Log the current exception's traceback without aborting the script.

    func: identifier of the item being processed (a page object or a
          module name) -- logged alongside the traceback so the failure
          can be traced back to its source. The original accepted this
          parameter but never used it.
    """
    err = traceback.format_exc()
    output(u"Error while processing %s:" % func)
    output(err)
def dont_uploaded(page):
    """Report a commons file that could not be converted.

    Appends a new section to the bot's error page saying the commons
    page has no processable template.
    """
    report = u"""
== No s'ha pogut carregat la pàgina ==
El llibre [[:commons:%s]] no s'ha pogut carregar automàticament perque la pàgina de commons no té plantilles processables --~~~~
""" % page
    errors_page.put(errors_page.get() + report, comment=u"Error d'Anskarbot")
def save_obj(file, obj, test=False):
    """Pickle obj into a bz2-compressed dump under temp/.

    file: base name of the dump file (note: shadows the builtin `file`;
          the name is kept so existing keyword callers don't break).
    obj:  picklable object -- callers pass lists or dicts, not strings.
    test: when True, write to a separate "*_test" dump so test runs do
          not clobber the production state file.
    """
    if not test:
        path = "temp/.%s.bin" % file
    else:
        path = "temp/.%s_test.bin" % file
    f = bz2.BZ2File(path, 'w')
    try:
        # HIGHEST_PROTOCOL: binary output, the file is bz2 anyway
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # close even if dump raises (original leaked the handle on error)
        f.close()
def read_obj(file, test=False):
    """Load an object previously written by save_obj.

    file: base name of the dump file (same naming rules as save_obj).
    test: when True, read the "*_test" dump instead of the production one.
    Returns the unpickled object.
    """
    if not test:
        path = "temp/.%s.bin" % file
    else:
        path = "temp/.%s_test.bin" % file
    f = bz2.BZ2File(path, 'r')
    try:
        return pickle.load(f)
    finally:
        # close even if load raises (original leaked the handle on error)
        f.close()
def output(msg, bot_op=True):
"""Create a log file with current prints
bot_op: boolean to print msg in console"""
try:
unicode(msg)
except:
msg = msg
try:
with codecs.open("./temp/%s.txt" % log, "a", "utf-8") as f:
f.write(u"%s\n" % msg)
except:
pass
if bot_op:
pwb.output(msg)
else:
print "done", unicode(msg)[:25]
def find_author(comm_author):
author = [a for a in gen_autors for b in comm_author.split(" ") if b not in ("i", "y", "de") and b in a]
counter = dict(Counter(author))
author_p = []
for author_f in author:
print comm_author
print counter.items()
if counter and sorted(counter.items())[0][1] > 1:
author_t = sorted(counter.items())[0][0]
else:
author_t = "[[Autor:%s|%s]]" % (comm_author,comm_author)
print author_t
if test:
pwb.input("")
return author_t
def get_data(comm_text, page, j, book=True):
global timestamp
data = dict(re_info.findall(comm_text))
print data
if not book:
for key in ws_keys:
if key not in data:
data[key] = ""
data["Title"] = re.findall(ur"\{{2}.*?=(.+)\}{2}", data["Description"])[0]
else:
data["Title"] = "[[%s]]" % data["Title"]
data["Image"] = unicode(page)[10:]
if data["Author"]:
data["Author"] = find_author(data["Author"])
text = "{{%s}}" % ws_tmplt.format(**data)
if test:
resp = pwb.input("desem timestamp:\n[y|N]\n")
if resp in ("s", "si", "y"):
save_obj("viquitext_timestamp", [timestamp[j]])
return text
def main():
    """Walk the commons .djvu generator and create the matching
    "Llibre:" index pages on wikisource.

    Pages whose commons text carries {{Book}} or {{Information}} are
    converted through get_data(); anything else is reported with
    dont_uploaded(). Exceptions are logged via backtrace_error() so one
    bad page does not stop the run. Returns 0.
    """
    list_comm = []  # NOTE(review): never filled -- see commented-out lines below
    n = 1   # count of pages written, checked against the global `limit`
    j = -1  # index into the global `timestamp` list, parallel to gen_djvu
    for page in gen_djvu:
        j += 1
        titol_comm = page.title()
        # commons "File:X.djvu" -> wikisource "Llibre:X.djvu" ([5:] drops "File:")
        pag_wt = pwb.Page(site, u"Llibre:%s" % titol_comm[5:])
        if not pag_wt.exists():
            try:
                comm_text = page.get()
                print comm_text
                if "== {{int:filedesc}} ==\n{{Book" in comm_text:
                    text_final = get_data(comm_text, page, j)
                elif "{{Information\n|Description=" in comm_text:
                    text_final = get_data(comm_text, page, j, book=False)
                else:
                    # no processable template: record the failure and move on
                    dont_uploaded(page)
                    continue
                output(text_final)
                if test:
                    # interactive mode: ask before each save
                    resp = pwb.input("Desem\n[y/N]")
                    if resp in ("s", "si", "y"):
                        pag_wt.put(text_final, comment="Proves de BOT")
                else:
                    pag_wt.put(text_final, comment="Proves de BOT")
                #llista_comm.append(page.title())
                # stop after `limit` pages when a test limit is set
                if limit and n >= limit:
                    break
                n += 1
            except:
                # log the traceback and keep processing the next page
                backtrace_error(pag_wt)
        else:
            print "ja el tenim"
            if test:
                # test mode: re-run the conversion for already-existing
                # pages too, so the generated text can be inspected
                try:
                    comm_text = page.get()
                    print comm_text
                    if "== {{int:filedesc}} ==\n{{Book" in comm_text:
                        text_final = get_data(comm_text, page, j)
                    elif "{{Information\n|Description=" in comm_text:
                        text_final = get_data(comm_text, page, j, book=False)
                    output(text_final)
                except:
                    backtrace_error(pag_wt)
    # save_obj("djvu_comm", llista_comm)
    return 0
def pages_comm(cats):
    """Yield commons File: pages from a category and its subcategories.

    cats: a category title (unicode) or a list of titles. Subcategories
          discovered while walking (ns 14) are appended to `cats`, so the
          same for-loop visits them later. File pages (ns 6) are collected
          together with their timestamps and yielded as pwb.Page objects.

    The walk starts from the last persisted timestamp (or a fixed one in
    test mode) and, outside test mode, persists the newest timestamp seen
    so the next run resumes where this one stopped.
    """
    global site_comm, timestamp
    pages = []
    if isinstance(cats, unicode): # a single category must still be iterable
        cats = [cats]
    if test:
        timestamp = ["2010-11-07T18:07:23Z"] # fixed starting point for test runs; override for other languages
    else:
        timestamp = read_obj("viquitext_timestamp") # resume from last persisted timestamp
    # NOTE: `cats` grows inside the loop body -- that is deliberate, it is
    # how subcategories get traversed by this same loop
    for cat in cats:
        print cat
        params = {
        "action":"query",
        "list":"categorymembers",
        "cmtitle" : cat,
        "cmlimit" : 500,
        "cmprop" : "title|timestamp",
        "cmsort" : "timestamp",
        "cmstart" : timestamp[0]
        }
        data = GetData(params, site=site_comm)
        for x in data["query"]["categorymembers"]:
            if x["ns"] == 14:
                # subcategory: queue it for traversal
                cats.append(x["title"])
            elif x["ns"] == 6:
                # file page: remember title and timestamp (parallel lists)
                pages.append(x["title"])
                timestamp.append(x["timestamp"])
        # follow old-style API continuation until the category is exhausted
        while "query-continue" in data:
            params["cmcontinue"] = data["query-continue"]["categorymembers"]["cmcontinue"]
            data0 = GetData(params, site=site_comm)
            for x in data0["query"]["categorymembers"]:
                if x["ns"] == 14:
                    cats.append(x["title"])
                elif x["ns"] == 6:
                    pages.append(x["title"])
                    timestamp.append(x["timestamp"])
            data = data0
    if not test:
        # persist the newest timestamp so the next run starts after it
        save_obj("viquitext_timestamp", [timestamp[-1]])
    for page in pages:
        print page
        yield pwb.Page(site_comm, page)
def authors():
    """Return the titles of all pages in the wikisource Author namespace.

    In Catalan wikisource the Autor namespace is 106 (it is 102 on the
    English wikisource). Find the right number for other languages with
    https://<lang>.wikisource.org/wiki/Special:ApiSandbox#action=query&prop=info&format=json

    Fixes over the original continuation loop, which could never work:
    * it set "cmcontinue" (a categorymembers key) instead of the
      allpages continuation parameter, and read
      query-continue.allpages["cmcontinue"] which does not exist
    * it queried site_comm (commons) instead of the wikisource site
    * it called GetData(...).submit() inconsistently with every other
      call site in this file
    """
    global site
    pages = []
    params = {
        "action":"query",
        "list":"allpages",
        "aplimit" : 500,
        "apnamespace" : 106
    }
    data = GetData(params, site=site)
    for x in data["query"]["allpages"]:
        pages.append(x["title"])
    while "query-continue" in data:
        # copy whatever continuation key the API returned (apfrom on old
        # MediaWiki versions, apcontinue on newer ones)
        params.update(data["query-continue"]["allpages"])
        data = GetData(params, site=site)
        for x in data["query"]["allpages"]:
            pages.append(x["title"])
    return pages
if __name__ == '__main__':
    timestamp = []
    # interactive test mode: every save asks for confirmation first;
    # set to False for unattended runs (see module docstring)
    test = True
    # wikisource index-page template: the {Field} slots are filled from
    # the commons {{Book}} template via ws_tmplt.format(**data)
    ws_tmplt = u""":MediaWiki:Proofreadpage_index_template
|Títol=''{Title}''
|Autor={Author}
|Editor={Publisher}
|Lloc={City}
|Any={Date}
|Font=[[:commons:{Image}|Commons]]
|Imatge=1
|Pàgines=<pagelist/>
|Sumari=
|Nivell= I
"""
    # commons {{Book}} keys the template needs; get_data() guarantees
    # each one exists when processing {{Information}} pages
    ws_keys = ("Title", "Author", "Publisher", "City", "Date", "Image")
    log = "carrega-viquitext"  # base name of the log file under temp/
    # presumably /data only exists on the hosted bot environment, so its
    # presence selects the alternate site config -- TODO(review) confirm
    if os.path.exists("/data"):
        site = pwb.Site("ca", "wikisource_ba")
    else:
        site = pwb.Site("ca", "wikisource")
    page = u"Usuari:Anskarbot/proves"
    # bot page where conversion failures are reported (dont_uploaded)
    errors_page = pwb.Page(site, page)
    limit = None  # set to a small int to process only a few pages while testing
    try:
        # start a fresh log file stamped with the run's start time
        with codecs.open("./temp/%s.txt" % log, "w", "utf-8") as f:
            f.write(unicode(datetime.datetime.now()))
        # re_info = re.compile(r"\{{2}Information.*.\}{2}", re.DOTALL)
        # captures "|key = value" pairs from the commons template text
        re_info = re.compile(ur".*?[|] ?(\w+) *= ?(.+)\n")
        site_comm = pwb.Site("wikimedia", "commons")
        cat_comm = u'Category:DjVu files in Catalan'
        gen_djvu = pages_comm(cat_comm)
        gen_autors = authors()
        main()
    except:
        # any unexpected failure is logged instead of crashing silently
        backtrace_error(__name__)
    finally:
        pwb.stopme()