/* UtilLib_Html - Copyright (c) 2017-2025 Vincent Calame - Exemole
 * Logiciel libre donné sous triple licence :
 * 1) selon les termes de la CeCILL V2
 * 2) selon les termes de l’EUPL V.1.1
 * 3) selon les termes de la GNU GPLv3
 * Voir le fichier licences.txt
 */


package net.mapeadores.util.html.jsoup;

import net.mapeadores.util.html.HtmlCleaner;
import net.mapeadores.util.html.TrustedHtml;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Safelist;


/**
 * Catalogue of ready-made {@link HtmlCleaner} instances backed by jsoup, plus
 * the factory methods that build the project-specific {@link Safelist}s.
 *
 * <p>The class only exposes constants and static methods and cannot be
 * instantiated.
 *
 * @author Vincent Calame
 */
public final class HtmlCleaners {

    /**
     * Parses the input as a body fragment and serializes it back; no safelist
     * filtering is applied.
     */
    public static final HtmlCleaner WELLFORMED = new WellformedTrustedHtmlFactory();
    /** Cleans with {@link Safelist#none()} and keeps only the resulting text. */
    public static final HtmlCleaner TEXT_ONLY = new TextOnlyHtmlCleaner();
    /** Cleans with {@link Safelist#simpleText()}. */
    public static final HtmlCleaner SIMPLE_TEXT = new SafelistHtmlCleaner(Safelist.simpleText());
    /** Cleans with {@link Safelist#basic()}. */
    public static final HtmlCleaner BASIC = new SafelistHtmlCleaner(Safelist.basic());
    /** Cleans with {@link Safelist#basicWithImages()}. */
    public static final HtmlCleaner BASIC_WITH_IMAGES = new SafelistHtmlCleaner(Safelist.basicWithImages());
    /** Cleans with {@link Safelist#relaxed()}. */
    public static final HtmlCleaner RELAXED = new SafelistHtmlCleaner(Safelist.relaxed());
    /** Cleans with the safelist built by {@link #extended()}. */
    public static final HtmlCleaner EXTENDED = new SafelistHtmlCleaner(extended());
    /** Cleans with the safelist built by {@link #extendedWithStyle()}. */
    public static final HtmlCleaner EXTENDED_WITH_STYLE = new SafelistHtmlCleaner(extendedWithStyle());
    /**
     * Cleans with the safelist built by {@link #extendedWithStyle()}.
     *
     * <p>NOTE(review): this is configured exactly like
     * {@link #EXTENDED_WITH_STYLE}; despite its name, no {@code data-*}
     * attribute is allowed. This looks like a copy-paste slip, but the intended
     * safelist cannot be determined from this file, so the behaviour is kept
     * unchanged. Confirm what was meant before relying on this constant.
     */
    public static final HtmlCleaner EXTENDED_WITH_DATA = new SafelistHtmlCleaner(extendedWithStyle());
    /** Cleans with the safelist built by {@link #importation()}. */
    public static final HtmlCleaner IMPORTATION = new SafelistHtmlCleaner(importation());

    private HtmlCleaners() {
        // Static holder: not meant to be instantiated.
    }


    /**
     * Convenience shortcut that runs {@link #TEXT_ONLY} on the given markup.
     *
     * @param html the markup to reduce to text
     * @return the string form of the {@link TrustedHtml} produced by
     *         {@link #TEXT_ONLY}
     */
    public static String textOnly(String html) {
        TrustedHtml cleaned = TEXT_ONLY.cleanHtml(html);
        return cleaned.toString();
    }

    /**
     * Builds a safelist that starts from {@link Safelist#relaxed()} and adds
     * the {@code hr}, {@code iframe}, {@code video}, {@code audio} and
     * {@code source} tags with a selection of their attributes, the
     * {@code target} and {@code rel} attributes on links, and the {@code id},
     * {@code class} and {@code lang} attributes on every tag.
     *
     * <p>The {@code src} attribute of the embedded-content tags is restricted
     * to the {@code http} and {@code https} protocols.
     *
     * @return the safelist assembled by this call
     */
    public static Safelist extended() {
        Safelist safelist = Safelist.relaxed();
        safelist.addTags("hr", "iframe", "video", "audio", "source");
        // Attributes accepted on the embedded-content tags.
        safelist.addAttributes("iframe", "src", "name", "allowfullscreen", "height", "width", "frameborder", "scrolling", "marginwidth", "marginheight");
        safelist.addAttributes("video", "src", "crossorigin", "poster", "preload", "autoplay", "mediagroup", "loop", "muted", "controls", "height", "width");
        safelist.addAttributes("audio", "src", "crossorigin", "preload", "autoplay", "mediagroup", "loop", "controls");
        safelist.addAttributes("source", "src", "type");
        safelist.addAttributes("a", "target", "rel");
        // Embedded content may only be loaded over http(s).
        safelist.addProtocols("iframe", "src", "http", "https");
        safelist.addProtocols("video", "src", "http", "https");
        safelist.addProtocols("audio", "src", "http", "https");
        safelist.addProtocols("source", "src", "http", "https");
        // Generic attributes accepted on every tag (this call used to be duplicated).
        safelist.addAttributes(":all", "id", "class", "lang");
        return safelist;
    }

    /**
     * Builds the {@link #extended()} safelist and additionally allows the
     * {@code style} attribute on every tag.
     *
     * @return the safelist assembled by this call
     */
    public static Safelist extendedWithStyle() {
        Safelist safelist = extended();
        safelist.addAttributes(":all", "style");
        return safelist;
    }

    /**
     * Builds the {@link #extendedWithStyle()} safelist and additionally allows
     * the {@code data-bdf-type} and {@code data-bdf-ref} attributes on every
     * tag.
     *
     * @return the safelist assembled by this call
     */
    public static Safelist importation() {
        Safelist safelist = extendedWithStyle();
        safelist.addAttributes(":all", "data-bdf-type", "data-bdf-ref");
        return safelist;
    }


    /**
     * Cleaner that only round-trips the markup through jsoup's body-fragment
     * parser; nothing is filtered out by a safelist.
     */
    private static class WellformedTrustedHtmlFactory implements HtmlCleaner {

        private WellformedTrustedHtmlFactory() {

        }

        /**
         * Parses {@code html} as a body fragment and returns the inner HTML of
         * the resulting body.
         *
         * @param html the markup to re-serialize
         * @return the serialized body content wrapped in a {@link TrustedHtml}
         */
        @Override
        public TrustedHtml cleanHtml(String html) {
            Document fragment = Jsoup.parseBodyFragment(html);
            String serialized = fragment.body().html();
            return new TrustedHtml(serialized);
        }

    }


    /**
     * Cleaner that filters the document with {@link Safelist#none()} and keeps
     * the text of the result.
     */
    private static class TextOnlyHtmlCleaner implements HtmlCleaner {

        private TextOnlyHtmlCleaner() {

        }

        /**
         * Parses {@code html} as a full document, cleans it with an empty
         * safelist and wraps the text of the cleaned document.
         *
         * <p>NOTE(review): the wrapped value comes from {@code text()}, not
         * from an HTML serialization. If, as the jsoup documentation describes,
         * that text is entity-decoded, input such as {@code &lt;b&gt;} would
         * come back as literal markup inside a {@link TrustedHtml}. Confirm
         * how callers emit this value; the behaviour is left unchanged here
         * because {@link HtmlCleaners#textOnly(String)} relies on it.
         *
         * @param html the markup to reduce to text
         * @return the text of the cleaned document wrapped in a
         *         {@link TrustedHtml}
         */
        @Override
        public TrustedHtml cleanHtml(String html) {
            Document parsed = Jsoup.parse(html);
            Document stripped = new Cleaner(Safelist.none()).clean(parsed);
            return new TrustedHtml(stripped.text());
        }

    }


    /**
     * Cleaner that delegates to {@link Jsoup#clean(String, Safelist)} with the
     * safelist supplied at construction time.
     */
    private static class SafelistHtmlCleaner implements HtmlCleaner {

        private final Safelist safelist;

        private SafelistHtmlCleaner(Safelist safelist) {
            this.safelist = safelist;
        }

        /**
         * Cleans {@code html} against this cleaner's safelist.
         *
         * @param html the markup to clean
         * @return the cleaned markup wrapped in a {@link TrustedHtml}
         */
        @Override
        public TrustedHtml cleanHtml(String html) {
            String cleaned = Jsoup.clean(html, safelist);
            return new TrustedHtml(cleaned);
        }

    }

}
