##!/usr/bin/python
## -*- coding: utf-8 -*-
## Test processing of pdftohtml output (html, xml, txt)
import os  # generic file functions and system commands
import re
import subprocess
import unicodedata as ud

import bs4
import pywikibot as bot
# from BrolloBot3 import find_stringa, produci_lista
# pywikibot site handles; getXml() saves pages to `mul`.
mul = bot.Site("mul", "wikisource")
it = bot.Site("it", "wikisource")

# Page skeleton, filled by getXml() with
# (left header, page number, right header, body text).
basePagina = (
    '<noinclude><pagequality level="1" user="BrolloBot" />'
    '{{tst|vdll}}{{rh|%s|%d|%s}}\n----</noinclude>%s<noinclude></noinclude>'
)
def rebuild(linee):
    """Concatenate pdftohtml ``<text>`` elements into a single string.

    Each element must support ``element["top"]`` / ``element["left"]``
    (integer-like strings) and ``str(element)`` (its markup).  A newline is
    inserted before an element when, compared with the previous one, either

    * it sits lower on the page (larger ``top``) and does not start further
      to the right (previous ``left`` >= current ``left``), or
    * it sits more than 100 units higher on the page.

    The outer ``<text ...>`` / ``</text>`` wrapper of every element is
    dropped.  Finally the first two ``<b>`` / ``</b>`` pairs of the whole
    text become ``<rh>`` / ``</rh>`` followed by a newline, which is the
    marker rh() later looks for.

    Returns the rebuilt text.
    """
    parts = []
    previous = None
    for index, current in enumerate(linee):
        if index > 0:
            prev_top = int(previous["top"])
            cur_top = int(current["top"])
            if prev_top < cur_top:
                # Moved down the page: break unless the text continues
                # further to the right.
                needs_break = int(previous["left"]) >= int(current["left"])
            else:
                # Moved up the page: only a large jump counts as a break.
                needs_break = prev_top - cur_top > 100
            if needs_break:
                parts.append("\n")
        markup = str(current).replace("</text>", "")
        # Drop everything up to and including the opening tag's ">".
        parts.append(markup[markup.find(">") + 1:])
        previous = current
    testo = "".join(parts)
    testo = testo.replace("<b>", "<rh>", 2)
    return testo.replace("</b>", "</rh>\n", 2)
def cleanup(linea):
    """Return the markup of *linea* without its outer ``<text>`` wrapper.

    *linea* is converted with ``str()``; every ``</text>`` is removed and the
    result is cut after the position of the first ``>`` of the original
    string (i.e. after the opening tag).  A string without ``>`` is returned
    whole, minus any ``</text>``.
    """
    raw = str(linea)
    body_start = raw.find(">") + 1
    without_closing = raw.replace("</text>", "")
    return without_closing[body_start:]
def xml2html(xml):
    """Turn pdftohtml XML into the lightly marked-up text used for the page.

    Steps:

    1. Parse *xml* with BeautifulSoup (lxml) and rebuild the text from all
       ``<text>`` elements via rebuild().
    2. Write that raw rebuilt text to ``pagina.html`` (UTF-8) in the current
       directory.
    3. Normalise the markup: force exactly one blank line before every
       ``<b>``, turn "newline + spaces" into ``<br>`` + newline, merge
       adjacent italic/bold runs, and join words hyphenated across lines.
    4. Wrap bold text that opens a paragraph and ends with a space before
       ``</b>`` in ``{{lemma|...|l}}``.

    Args:
        xml: the XML document as a string.

    Returns:
        The processed text.
    """
    blank_before_bold = re.compile(r"\n+<b>")
    indented_line = re.compile(r"\n +")
    paragraph_bold = re.compile(r"\n\n<b>(.+?) </b>")

    soup = bs4.BeautifulSoup(xml, "lxml")
    html = rebuild(soup.find_all("text"))

    # The context manager guarantees the dump is flushed and closed even if
    # the write fails (the bare open(...).write(...) left that to the GC).
    with open("pagina.html", "w", encoding="utf-8") as dump:
        dump.write(html)

    html = blank_before_bold.sub("\n\n<b>", html)
    html = indented_line.sub("<br>\n", html)
    html = (
        html.replace("</i><i>", "")
        .replace("</b><b>", "")
        .replace("</i>\n<i>", "\n")
        .replace("</b>\n<b>", "\n")
        .replace("-\n", "")  # re-join words split by end-of-line hyphenation
    )
    html = paragraph_bold.sub(r"\n\n<b>{{lemma|\1|l}} </b>", html)
    return html
def rh(testo, pagina):
    """Move the running header from the body into the header placeholders.

    The first two ``<rh>...</rh>`` elements of *testo* are taken as the
    running header.  They are removed from the text (together with the
    ``"\\n"`` or ``"<br>\\n"`` that follows them).  The title is the content
    of the first element, or of the second when the first is purely numeric.
    The title replaces ``#left`` when ``pagina - 35`` is even and ``#right``
    otherwise; the unused placeholder is blanked.  The result is NFC
    normalised.

    A page with fewer than two ``<rh>`` elements is handled instead of
    raising ``IndexError``: missing headers count as empty strings.

    Args:
        testo: page text containing the ``#left`` / ``#right`` placeholders.
        pagina: page number as passed to getXml().

    Returns:
        The text with header placeholders resolved.
    """
    pag = pagina - 35
    b = produci_lista(testo, "<rh>", "</rh>", 1)
    print(str(b[:6]))

    headers = b[:2]  # may hold 0, 1 or 2 elements
    titles = [find_stringa(el, "<rh>", "</rh>", 0) for el in headers]
    h1 = titles[0] if titles else ""
    h2 = titles[1] if len(titles) > 1 else ""

    for el in headers:
        testo = testo.replace(el + "\n", "").replace(el + "<br>\n", "")

    # A numeric first header is the page number; the title is the other one.
    if h1.isdigit():
        h1 = h2
    if pag % 2 == 0:  # even page
        testo = testo.replace("#left", h1)
    else:
        testo = testo.replace("#right", h1)
    testo = testo.replace("#right", "").replace("#left", "")
    return ud.normalize("NFC", testo)
def find_stringa(stringa, idi, idf, dc=0, x=None, side="left"):
    """Return the first (or last) substring delimited by *idi* ... *idf*.

    Args:
        stringa: text to search.
        idi: opening delimiter.
        idf: closing delimiter, searched after the opening one.
        dc: ``0`` returns only the content between the delimiters; any other
            value returns the delimiters as well.
        x: optional nesting marker.  When given, the match is extended to
            further *idf* occurrences until the matched span contains no
            more *x* than *idf* (balanced nesting).
        side: ``"right"`` starts from the last occurrence of *idi*; any other
            value starts from the first.

    Returns:
        The matched text, or ``""`` when *idi* is absent, *idf* does not
        follow it, or the nesting cannot be balanced.
    """
    start = stringa.rfind(idi) if side == "right" else stringa.find(idi)
    if start == -1:
        return ""

    # Test the raw find() result: adding len(idf) first would hide the -1
    # "not found" value whenever the closing delimiter is longer than one
    # character.
    close = stringa.find(idf, start + len(idi))
    if close == -1:
        return ""
    end = close + len(idf)

    if x is not None:
        while stringa[start:end].count(x) > stringa[start:end].count(idf):
            close = stringa.find(idf, end)
            if close == -1:
                # Ran out of closing delimiters: no balanced match exists.
                return ""
            end = close + len(idf)

    if dc == 0:
        return stringa[start + len(idi):end - len(idf)]
    return stringa[start:end]
def produci_lista(testo, idi, idf, dc=1, inizio=None):
    """Collect every *idi* ... *idf* element found in *testo*.

    Elements are located one at a time with find_stringa() (delimiters
    included, *inizio* passed as its nesting marker); each hit is removed
    from a working copy of the text before searching again, until nothing
    more is found.

    Args:
        testo: text to scan.
        idi: opening delimiter.
        idf: closing delimiter.
        dc: ``0`` stores only the content between the delimiters; any other
            value stores the element with its delimiters.
        inizio: nesting marker forwarded to find_stringa().

    Returns:
        List of the elements in order of appearance.
    """
    remaining = testo
    found = []
    while True:
        element = find_stringa(remaining, idi, idf, 1, inizio)
        if element == "":
            break
        # Remove only this occurrence so the next search finds the next one.
        remaining = remaining.replace(element, "", 1)
        if dc == 0:
            found.append(find_stringa(element, idi, idf, 0, inizio))
        else:
            found.append(element)
    return found
def getXml(pagina, scrivi=True):
    """Convert one PDF page to wiki text and optionally save it on the wiki.

    Runs ``pdftohtml -xml`` on
    ``../pdf/<pagina>_PDFsam_Vocabolardlladinleterar.pdf`` with output name
    ``pagina.xml``, processes the XML with xml2html(), wraps it in
    ``basePagina`` (page number shown as ``pagina - 35``), resolves the
    running header with rh() and writes the result to ``pagina.txt``.

    Args:
        pagina: integer page number; selects the PDF file and the target
            wiki page ``Page:Vocabolardlladinleterar.pdf/<pagina>``.
        scrivi: when true, the text is saved to that page on the ``mul``
            site.

    Returns:
        The final page text.

    Raises:
        subprocess.CalledProcessError: pdftohtml exited with a non-zero
            status.  Previously the status was ignored, so a stale
            ``pagina.xml`` from an earlier run could be processed instead.
        FileNotFoundError: pdftohtml is not installed, or ``pagina.xml`` was
            not produced.
    """
    sorgente = f"../pdf/{pagina}_PDFsam_Vocabolardlladinleterar.pdf"
    # Argument list (no shell) and check=True: fail loudly on conversion
    # errors rather than continuing with old output.
    subprocess.run(["pdftohtml", "-xml", sorgente, "pagina.xml"], check=True)

    with open("pagina.xml", encoding="utf-8") as sorgente_xml:
        xml = sorgente_xml.read()

    testo = xml2html(xml)
    testo = basePagina % ("#left", pagina - 35, "#right", testo)
    testo = rh(testo, pagina)

    with open("pagina.txt", "w", encoding="utf-8") as uscita:
        uscita.write(testo)

    if scrivi:
        bot.Page(mul, "Page:Vocabolardlladinleterar.pdf/" + str(pagina)).put(testo)
    return testo