Copy a link and turn the HTML of the page body into Markdown.
Since I am not very efficient, being pressed for time is sometimes unavoidable.
As we all know, laziness is the mother of innovation.
So I wrote a small crawler script: when I see an article I like, I fetch it directly and then share it in WeChat official-account format.
(Mainly because some articles are rich in useful content but poorly formatted — just like you and me.)
Of course, I will not mark the shared content as original; I keep it as a personal collection and always include a link to the source when reposting.
Purpose
Convert a regular blog post into Markdown-formatted text
by crawling the page and processing its characters.
There are similar tools online, including the fairly well-known Python third-party library html2text, but all of them have problems to some degree. After trying them, http://www.atoolbox.net/Tool.php?Id=715 was the most accurate.
So I wrote one myself.
usage
python3 blog2md.py
Copy the code
Then enter the link of the page you want to copy and collect.
The terminal will output the converted result in Markdown format, and a blog.md file will be generated in the same directory.
demo
- Crawl content is from my last original article: [
Python crawler – automatically gets the title of the reference link – into markdown format](blog.csdn.net/Mxdon_on/ar…)
- Console output
As you can see, we have almost all the formatting we need
- The file preview
[@path/to/blog2md.py] $ ls
blog2md.py blog.md
Copy the code
At this point, the blog.md file has been generated in the same directory. Let’s open it for a preview and see what it looks like
It’s exactly the same as the original I wrote myself
Completed function
- Converts bold, strikethrough, italics and other text styles to Markdown
- Converts images and links to Markdown
- Handles multi-level headings and page-title selection, and inserts a [toc] table of contents
- Resolves conflicts between inline code and block code embedded in articles, and supports fenced code blocks in different languages
- Supports Zhihu, CSDN, Jianshu and cnblogs articles; other sites are untested
- Supports a command-line interactive mode
TODO
- Multi-level quotes are untested; different Markdown renderers may handle them differently
- Block code can be slightly wrong in some unusual formats, but this does not affect normal usage for now
- Formulas are not supported
- There is no graphical packaging
- Multi-level lists are also a bit problematic
- Some sites' images cannot be hot-linked (e.g. some CSDN images), which may require decoding them first
- The generated document should ideally be renamed after the page title — it's one line of code, but not needed for my own use yet
- Something strange: an occasional empty div tag spans nearly 100 lines and causes problems; fortunately this is rare, so such tags are simply deleted for now
- Tables are not supported
- There is a little extra whitespace at the beginning, not yet fixed; some Markdown is still rendered as raw HTML
code
I thought it might be useful, so I put it on Gitee. If you need it, feel free to modify and use it.
Link: https://gitee.com/mxdon/funny_repo/tree/master/Convert_html_2_markdown
The source code is also included below — apologies for the length (you can just scroll past it):
# get title from title
# get content from article
# finish requests while article
# difficult @ filter out the correct content from so many ...
# and format different content.
import re
import requests
import time
import random
import urllib3
from bs4 import BeautifulSoup
#def getimg(link):
def run(filename,textString) :
src = open(filename,'a')
# Solved
# 1. bold
# 2. img and figcaption
# 3. title and contents
# 4. italics
# 5. code
# 6. nonsense tags
# 7. quote ### multi_quote TODO
# 8. delete
# 9. toc
# 10.link
# 11.p
# 12.add toc
# 13.formula ### TODO
toc_flag = 0
i = 0
l = The '-'
quote_flag = 0
code_flag = 0
title_flag = 0
#for line in src.readlines():
for line in textString.split('\n') :if len(line) ==0:
continue
templine = line
#### replace figcaption
if re.search('< /? (? <=figcaption)[^<]*>',templine):
continue
#### replace [toc]
if re.search('(? <=
'
)[^,templine):
toc_flag = toc_flag + 1
elif re.search('</div>',line):
toc_flag = toc_flag - 1
if toc_flag:
continue
templine = re.sub('<code\s[^<]*inline[^<]*>'.'<code>', templine)
code_inline = re.search('(? <=).*(? =
)',templine)
if code_inline:
#print('===========' + str(code_inline.group()))
templine = re.sub('< /? code>'.'`, templine)
if re.search('
'
\s[^,templine):
code_flag = 1
codetype = re.search('(? <=language-)([^\"]+?) (? = \ ") ',templine)
if codetype:
#titlelevel = re.search('(? <=
$)',templine)
)([0-9])(?>
#print(titlelevel.group())
#print(codetype.group())
templine = re.sub('
'
\s[^.'` ` `'+str(codetype.group())+"\n",templine).strip()
else:
templine = re.sub('
'
\s[^.'```\n',templine).strip()
templine = re.sub('< /? code>'.'\n```',templine).strip()
#### replace image
try:
full_img = re.search('(? <=
'
).*(?>,templine)
#print(full_img.group())
img_name_group = re.search('(? <=alt=\")[^\"]+? (? = \ ") '.str(full_img.group()))
img_group = re.search('(? <=src=\")([^\"]+?) (? = \ ") '.str(full_img.group()))
#print(img.group())
#print(img_name.group())
img_name = str(img_name_group.group())
img = str(img_group.group())
if len(img_name) <= 1 or img_name.startswith('im'):
img_name = ' '
else:
img_name = str(img_name_group.group())
templine = re.sub('
'
[^.'\n! ['+str(img_name)+'] ('+str(img)+')\n',templine).strip()
except:
templine = re.sub('
'
[^.'! [load img faild]()',templine).strip()
#templine = re.sub('
', '! ['+str(img_name)+']('+str(img)+')',templine).strip()
[^
#### replace bold
templine = re.sub('< /? strong[^<]*>'.'* *',templine).strip()
templine = re.sub('< /? (b>|b\s[^<]*>)'.'* *',templine).strip() # conflict with blockquote
templine = re.sub('< /? em[^<]*>'.'* *',templine).strip()
#### replace italics
templine = re.sub('< /? i[^<]*>'.The '*',templine).strip()
#### replace delete
templine = re.sub('< /? del[^<]*>'.'~ ~',templine).strip()
templine = re.sub('< /? s>'.'~ ~',templine).strip()
#### replace list
#if re.search('<ol[^<]*>',templine):
# i = 1
#elif re.search('</ol[^<]*>',templine):
# i = 0
# l = '- '
ifi ! =0:
if re.search('<li>',templine):
l = str(i) + '. '
templine = re.sub('<li>'.str(i) + '. ',templine).strip()
i = i + 1
templine = re.sub('<li>',l,templine).strip()
templine = re.sub('</li>'.'\n',templine).strip()
#### replace link
try:
full_link_group = re.search('(? <=
)'
)[^,templine)
full_link = str(full_link_group.group())
link_href_group = re.search('(? <=href=\").*(? = \ ") ',full_link)
link_href = str(link_href_group.group())
#print(link_href)
link_text_group = re.search('(? < = >). * ',full_link)
link_text = str(link_text_group.group())
#print(link_text)
templine = re.sub('
]+? >.*'
[^>.'['+link_text+'] ('+link_href+') ',templine).strip()
except:
templine = templine
#### replace p
#lineval = re.sub('<p[^<]*>','',templine).strip()
#if re.search('</p[^<]*>',templine):
#print("")
templine = re.sub('<p[^<]*>'.' ',templine).strip()
templine = re.sub('</p[^<]*>'.'\n',templine).strip()+'\n'
#### 1. replace quote
if re.search('<blockquote[^<]*>',templine):
quote_flag = quote_flag + 1
print(">",end="")
#templine = re.sub('
', '> ', templine).strip()
[^
templine = re.sub('<blockquote[^<]*>'.' ', templine).strip()
src.write('>')
elif re.search('</blockquote[^<]*>',templine):
quote_flag = quote_flag - 1
templine = re.sub('</blockquote[^<]*>'.' ', templine).strip()
if quote_flag:
print(templine,end="")
src.write(templine)
continue
#### replace different level title
#templine = re.sub('<h1[^<]*>','# ',templine).strip()
templine = re.sub('<h1[^<]*>'.'## ',templine).strip()
templine = re.sub('<h2[^<]*>'.'## ',templine).strip()
templine = re.sub('<h3[^<]*>'.'# # #',templine).strip()
templine = re.sub('<h4[^<]*>'.'# # # #',templine).strip()
templine = re.sub('<h5[^<]*>'.'# # # # #',templine).strip()
templine = re.sub('<h6[^<]*>'.'# # # # # #',templine).strip()
templine = re.sub('</h[0-9].*>'.' ',templine).strip()
if re.search('< /? title[^<]*>',templine):
templine = re.sub('< /? title[^<]*>'.' ',templine).strip()
#### replace all tags
templine = re.sub(+ '< [^ <]? > '.' ', templine).strip()
print(The '#'+templine+'\n[toc]\n')
src.write(The '#'+templine+'\n[toc]\n')
continue
#### replace all tags
templine = re.sub('< /? code>'.'`, templine).strip()
templine = re.sub(+ '< [^ <]? > '.' ', templine).strip()
lineval = templine + '\n'
print(lineval)
if len(lineval) == 0:
continue
else:
src.write(lineval+'\n')
src.close()
if __name__ == '__main__':
    # verify=False is used below to tolerate bad certificates, so mute the
    # resulting InsecureRequestWarning noise.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    heads = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
                           ' (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    # Truncate any previous output so run() can append onto a clean file.
    des = open('blog.md', 'w')
    des.write('')
    des.close()
    url = input('Please input a legal URL: ')
    while not url.startswith('http'):
        url = input('Please input a legal URL again : \n')
    try:
        res = requests.get(url, timeout=30, headers=heads, verify=False)
        res.encoding = 'UTF-8'
        content = res.text
        soup = BeautifulSoup(content, 'html.parser')
        # Pick the article container known to hold the post body on each site.
        if 'zhihu.com' in url:
            temp_article = soup.find('div', class_='Post-RichTextContainer')
        elif 'csdn.net' in url:
            temp_article = soup.article
        elif 'cnblogs.com' in url:
            temp_article = soup.find('div', class_='markdown-here-wrapper')
        else:
            # Fallback for unsupported sites.  The original left temp_article
            # unbound on this path, which crashed with a NameError below.
            temp_article = soup.article or soup.body
        article = str(temp_article)
    except Exception:
        print('request error')
        exit(1)
    try:
        title = soup.find('title')
        target_head = str(soup.title)
    except Exception:
        # No <title>: fall back to the first <h1> as the document heading.
        print('===== title not found =====')
        h1 = soup.find('h1')
        target_head = str(soup.h1)
    # Prepend the page head so run() emits the '# title\n[toc]' heading first.
    target_text = target_head + article
    run('blog.md', target_text)
Copy the code