summaryrefslogtreecommitdiff
path: root/converters/html_to_md.py
blob: e3109e18d4abe91c3ea135e11b83006a20b61164 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
from os import walk
import os.path
from markdownify import markdownify

from bs4 import BeautifulSoup

def wrap_links(html):
    """Make the single link of a list item span the item's whole text.

    The markup is parsed with BeautifulSoup's ``html.parser``. For every
    ``<li>`` that contains exactly one ``<a>``, the anchor's content is
    replaced by the full text of the list item (text fragments joined by a
    single space, each stripped), and the list item is then emptied so that
    the anchor becomes its only child. List items with no link, or with
    more than one link, are left as they are.

    Returns the resulting document serialized back to a string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('li'):
        anchors = item.find_all('a')
        if len(anchors) != 1:
            # Zero or several links: nothing is rewritten for this item.
            continue
        (anchor,) = anchors
        # Capture the item's text before clearing it; the anchor keeps its
        # attributes (e.g. href) and only its content changes.
        anchor.string = item.get_text(' ', strip=True)
        item.clear()
        item.append(anchor)
    return str(soup)


def convert_html_to_md(HtmlList):
    """Convert each HTML file in *HtmlList* to a Markdown file.

    Every input file is read, passed through ``wrap_links`` and then through
    ``markdownify`` with ATX-style headings. The result is written to
    ``output/markdown/<name>``, where ``<name>`` is the input's base file
    name with ``.html`` replaced by ``.md``.

    Args:
        HtmlList: Iterable of paths to the HTML files to convert.

    Returns:
        A fixed status message once every file has been processed.
    """
    for path in HtmlList:
        # Context managers guarantee both handles are closed, even when
        # reading, converting or writing raises part-way through.
        with open(str(path), "r") as source:
            html = wrap_links(source.read())
        markdown = markdownify(html, heading_style="ATX")

        # basename keeps only the file name, so all outputs land flat in
        # output/markdown/ regardless of the input's directory.
        md_name = os.path.basename(str(path)).replace(".html", ".md")
        with open("output/markdown/" + md_name, "w") as target:
            target.write(markdown)

    return "html was converted to markdown (1/3)"