User:Alex brollo/voclad.py

##!/usr/bin/python
## -*- coding: utf-8  -*-

## Test of pdftohtml output processing (html, xml, txt)

import os                 # funzioni generiche sui file e comandi di sistema
import bs4 
import re
import pywikibot as bot
# from BrolloBot3 import find_stringa, produci_lista
import unicodedata as ud
# Site handle for the "mul" Wikisource; getXml saves the generated pages through it.
mul=bot.Site("mul","wikisource")
# Site handle for the "it" Wikisource (not referenced by the code visible in this file).
it=bot.Site("it","wikisource")

# %-format template for the wikitext of one page.  The placeholders are, in
# order: the first {{rh}} argument (%s), the second {{rh}} argument (%d),
# the third {{rh}} argument (%s) and the page body (%s).  getXml fills them
# with "#left", the page number minus 35, "#right" and the converted text.
basePagina = (
    '<noinclude><pagequality level="1" user="BrolloBot" />'
    '{{tst|vdll}}{{rh|%s|%d|%s}}\n----</noinclude>%s<noinclude></noinclude>'
)

def rebuild(linee):
    """Concatenate the ``<text>`` elements of a page into a single string.

    ``linee`` is an indexable sequence of elements that expose ``"top"`` and
    ``"left"`` through item access and render as markup through ``str()``.
    Each element contributes whatever follows the first ``>`` of its markup,
    with every ``</text>`` removed.

    A newline is inserted before an element when either
    * the previous element's ``top`` is smaller than the current one's and
      its ``left`` is greater than or equal to the current one's, or
    * the previous ``top`` exceeds the current ``top`` by more than 100.

    Finally the first two ``<b>`` become ``<rh>`` and the first two ``</b>``
    become ``</rh>`` followed by a newline.
    """
    pezzi = []
    for indice, corrente in enumerate(linee):
        if indice:
            precedente = linee[indice - 1]
            top_prec = int(precedente["top"])
            top_corr = int(corrente["top"])
            a_capo = (
                top_prec < top_corr
                and int(precedente["left"]) >= int(corrente["left"])
            ) or top_prec - top_corr > 100
            if a_capo:
                pezzi.append("\n")
        # Drop the closing tag, then everything up to the end of the opening tag.
        marcatura = str(corrente).replace("</text>", "")
        _, chiusura, resto = marcatura.partition(">")
        pezzi.append(resto if chiusura else marcatura)

    testo = "".join(pezzi)
    testo = testo.replace("<b>", "<rh>", 2)
    return testo.replace("</b>", "</rh>\n", 2)

def cleanup(linea):
    """Return the content of one ``<text>`` element as a plain string.

    ``linea`` is converted with ``str()``; every ``</text>`` is removed and
    the leading part up to and including the first ``>`` is dropped.  The
    position of that ``>`` is measured on the string as it was before
    ``</text>`` was removed.
    """
    grezza = str(linea)
    inizio = grezza.find(">") + 1  # 0 when there is no ">" at all
    senza_chiusura = grezza.replace("</text>", "")
    return senza_chiusura[inizio:]
            
        
def xml2html(xml):
    """Turn the XML text produced by ``pdftohtml -xml`` into marked-up text.

    The XML is parsed with BeautifulSoup's ``lxml`` parser, every ``<text>``
    element is collected and passed to ``rebuild``.  The rebuilt text is
    written unchanged to ``pagina.html`` in the current directory, then
    post-processed:

    * any run of newlines before ``<b>`` becomes exactly two newlines;
    * a newline followed by spaces becomes ``<br>`` plus a newline;
    * adjacent ``<i>``/``<b>`` runs are merged, also across a line break;
    * ``-`` at the end of a line is removed together with the newline;
    * ``<b>text </b>`` following a blank line is wrapped as
      ``<b>{{lemma|text|l}} </b>``.

    Returns the post-processed string.
    """
    r=re.compile(r"\n+<b>")
    r1=re.compile(r"\n +")
    r2=re.compile(r"\n\n<b>(.+?) </b>")
    soup=bs4.BeautifulSoup(xml,"lxml")
    linee=soup.find_all("text")
    html=rebuild(linee)
    # Context manager so the file is flushed and closed deterministically.
    with open("pagina.html","w",encoding="utf-8") as f:
        f.write(html)
    html=r.sub("\n\n<b>",html)
    html=r1.sub("<br>\n",html)
    html=html.replace("</i><i>","")\
      .replace("</b><b>","")\
      .replace("</i>\n<i>","\n")\
      .replace("</b>\n<b>","\n")\
      .replace("-\n","")
    html=r2.sub(r"\n\n<b>{{lemma|\1|l}} </b>",html)

    return html

def rh(testo,pagina):
    """Move the running header of a page into its ``#left``/``#right`` slot.

    The ``<rh>...</rh>`` fragments of ``testo`` are collected with
    ``produci_lista`` (the first six are printed).  The first two fragments
    are removed from the text wherever they are followed by a newline or by
    ``<br>`` plus newline.  The header text is the content of the first
    fragment, or of the second one when the first consists of digits only.
    When ``pagina - 35`` is even the header replaces ``#left``, otherwise
    ``#right``; any placeholder still present is then removed and the result
    is NFC-normalised.

    Raises IndexError when fewer than two fragments are found.
    """
    numero = pagina - 35
    intestazioni = produci_lista(testo, "<rh>", "</rh>", 1)
    print(str(intestazioni[:6]))

    titolo = find_stringa(intestazioni[0], "<rh>", "</rh>", 0)
    alternativa = find_stringa(intestazioni[1], "<rh>", "</rh>", 0)

    # Strip both header fragments from the body, with either line ending.
    for frammento in (intestazioni[0], intestazioni[1]):
        for coda in ("\n", "<br>\n"):
            testo = testo.replace(frammento + coda, "")

    if titolo.isdigit():
        titolo = alternativa

    # Even page: header on the left; odd page: header on the right.
    segnaposto = "#left" if numero % 2 == 0 else "#right"
    testo = testo.replace(segnaposto, titolo)

    for residuo in ("#right", "#left"):
        testo = testo.replace(residuo, "")
    return ud.normalize("NFC", testo)

def find_stringa(stringa,idi,idf,dc=0,x=None,side="left"):
    """Return the substring of ``stringa`` delimited by ``idi`` and ``idf``.

    Parameters:
        stringa: text to search.
        idi: opening delimiter.
        idf: closing delimiter, searched for after the end of ``idi``.
        dc: 0 returns the text between the delimiters; any other value
            returns it with both delimiters included.
        x: optional marker of a nested opening.  When given, the match is
            extended to later occurrences of ``idf`` until the matched span
            contains no more occurrences of ``x`` than of ``idf``.
        side: "right" starts from the last occurrence of ``idi``, any other
            value from the first one.

    Returns:
        The matched text, or "" when ``idi`` is absent, when no ``idf``
        follows it, or when the nesting requested through ``x`` cannot be
        balanced.
    """
    inizio = stringa.rfind(idi) if side == "right" else stringa.find(idi)
    if inizio == -1:
        return ""

    # Test the search result itself: adding len(idf) to -1 would otherwise
    # hide a missing closing delimiter whenever idf is longer than one char.
    chiusura = stringa.find(idf, inizio + len(idi))
    if chiusura == -1:
        return ""
    fine = chiusura + len(idf)

    if x is not None:
        while stringa[inizio:fine].count(x) > stringa[inizio:fine].count(idf):
            chiusura = stringa.find(idf, fine)
            # No further closing delimiter (or no progress, for an empty
            # idf): the span cannot be balanced, so report "not found"
            # instead of looping forever or slicing with a bogus end index.
            if chiusura == -1 or chiusura + len(idf) <= fine:
                return ""
            fine = chiusura + len(idf)

    if dc == 0:
        return stringa[inizio + len(idi):fine - len(idf)]
    return stringa[inizio:fine]

def produci_lista(testo,idi,idf,dc=1,inizio=None):
    """List every ``idi``...``idf`` fragment of ``testo`` in extraction order.

    Fragments are located one at a time with ``find_stringa`` (``inizio`` is
    forwarded as its nested-opening marker); the first occurrence of each
    fragment is removed from a working copy of the text before the next
    search.  With ``dc == 0`` the delimiters are stripped from each
    fragment, otherwise the fragments keep them.  ``testo`` itself is not
    modified.
    """
    residuo = testo
    trovati = []
    while True:
        frammento = find_stringa(residuo, idi, idf, 1, inizio)
        if frammento == "":
            break
        residuo = residuo.replace(frammento, "", 1)
        if dc == 0:
            frammento = find_stringa(frammento, idi, idf, 0, inizio)
        trovati.append(frammento)
    return trovati


def getXml(pagina, scrivi=True):
    """Convert one single-page PDF into wikitext and optionally upload it.

    Runs ``pdftohtml -xml`` on
    ``../pdf/{pagina}_PDFsam_Vocabolardlladinleterar.pdf`` with ``pagina.xml``
    as output, converts that XML with ``xml2html``, wraps the result in
    ``basePagina`` (page number ``pagina - 35``) and resolves the running
    header with ``rh``.  The final text is written to ``pagina.txt`` and,
    when ``scrivi`` is true, saved to
    ``Page:Vocabolardlladinleterar.pdf/<pagina>`` on the ``mul`` site.

    Returns the final text.

    Raises RuntimeError when the pdftohtml command reports a non-zero
    status, so that a stale ``pagina.xml`` from an earlier run is never
    processed and uploaded by mistake.
    """
    comando=f"pdftohtml -xml ../pdf/{pagina}_PDFsam_Vocabolardlladinleterar.pdf pagina.xml"

    esito=os.system(comando)
    if esito!=0:
        raise RuntimeError(f"pdftohtml failed with status {esito}: {comando}")
    # Context managers close both files deterministically.
    with open("pagina.xml", encoding="utf-8") as f:
        xml=f.read()
    testo=xml2html(xml)
    testo=basePagina % ("#left",pagina-35,"#right",testo)

    testo=rh(testo,pagina)
    with open("pagina.txt","w",encoding="utf-8") as f:
        f.write(testo)
    if scrivi:
        bot.Page(mul,"Page:Vocabolardlladinleterar.pdf/"+str(pagina)).put(testo)
    return testo