"""Download an HTML page and pull the text out of its ``lyricbox`` div."""

import urllib.request

from bs4 import BeautifulSoup


def _get_html(url) -> str:
    """Fetch *url* and return the response body decoded to text.

    The charset declared in the response's Content-Type header is used
    when present; otherwise the body is decoded as UTF-8.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the body is not valid in the chosen charset.
    """
    with urllib.request.urlopen(url) as response:
        raw = response.read()
        # Honour the server-declared encoding instead of assuming UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"
    return raw.decode(charset)


def extract_content(html):
    """Return the child nodes of the first ``<div class="lyricbox">``.

    Args:
        html: HTML markup to parse with the ``html.parser`` backend.

    Returns:
        The ``contents`` list of the first matching div.

    Raises:
        IndexError: if the markup contains no ``div`` with class
            ``lyricbox``.
    """
    soup = BeautifulSoup(html, "html.parser")
    # find() stops at the first match instead of collecting every match.
    box = soup.find("div", class_="lyricbox")
    if box is None:
        # Keep the exception type callers saw from the former ``[0]`` lookup.
        raise IndexError('no <div class="lyricbox"> element found')
    return box.contents


def clean_content(contents) -> str:
    """Flatten an iterable of parsed nodes into plain text.

    Rules applied to each item, in order:

    * an item equal to ``"\\n"`` is dropped;
    * an item whose ``name`` is ``"script"`` is dropped;
    * an item whose ``name`` is ``"br"`` becomes a newline;
    * anything else contributes its ``text`` attribute, or ``str(item)``
      when it has no such attribute.

    Args:
        contents: iterable of nodes, e.g. the list returned by
            ``extract_content``.

    Returns:
        The concatenated text.
    """
    parts = []
    for node in contents:
        if node == "\n":
            continue
        # Items without a ``name`` attribute (e.g. plain ``str``) are
        # treated as text rather than raising AttributeError.
        tag_name = getattr(node, "name", None)
        if tag_name == "script":
            continue
        if tag_name == "br":
            parts.append("\n")
            continue
        try:
            parts.append(node.text)
        except AttributeError:
            parts.append(str(node))
    return "".join(parts)