Skip to content

Commit

Permalink
Rewrite lyrics integration tests
Browse files Browse the repository at this point in the history
  • Loading branch information
snejus committed Sep 4, 2024
1 parent 1b26aa6 commit 7c8eaec
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 286 deletions.
34 changes: 7 additions & 27 deletions beetsplug/lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
import json
import os.path
import re
import struct
import unicodedata
import urllib
import warnings
from html import unescape
from typing import Any

import requests
Expand Down Expand Up @@ -111,27 +111,6 @@
# Utilities.


def unichar(i):
    """Return the character for the integer code point ``i``.

    Code points that cannot be represented as a valid character (values
    outside the range accepted by ``chr``, or lone surrogates in
    U+D800..U+DFFF) yield U+FFFD REPLACEMENT CHARACTER instead of raising,
    so one malformed numeric entity cannot abort processing of a whole page.
    """
    try:
        char = chr(i)
    except (ValueError, OverflowError):
        # ValueError: outside 0..0x10FFFF; OverflowError: too large for a C int.
        return "\ufffd"
    # chr() accepts surrogate code points, but a lone surrogate cannot be
    # encoded to UTF-8 later on, so substitute it as well.
    if 0xD800 <= i <= 0xDFFF:
        return "\ufffd"
    return char


def unescape(text):
    """Resolve decimal ``&#NNN;`` HTML entities and non-breaking spaces.

    ``text`` may be ``str`` or ``bytes``; bytes are decoded as UTF-8 with
    undecodable sequences dropped. Both the ``&nbsp;`` entity and a literal
    U+00A0 character are turned into a regular space, and every decimal
    numeric character reference is replaced by the result of ``unichar``
    for its code point. Returns the resulting ``str``.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8", "ignore")

    # Handle the named entity and the already-decoded character alike.
    out = text.replace("&nbsp;", " ").replace("\u00a0", " ")

    def replchar(m):
        return unichar(int(m.group(1)))

    return re.sub(r"&#(\d+);", replchar, out)


def extract_text_between(html, start_marker, end_marker):
try:
_, html = html.split(start_marker, 1)
Expand Down Expand Up @@ -659,6 +638,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
html = BREAK_RE.sub("\n", html) # <br> eats up surrounding '\n'.
html = re.sub(r"(?s)<(script).*?</\1>", "", html) # Strip script tags.
html = re.sub("\u2005", " ", html) # replace unicode with regular space
html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags
html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove bold / italics

if plain_text_out: # Strip remaining HTML tags
html = COMMENT_RE.sub("", html)
Expand All @@ -674,10 +655,12 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"<div .*>\s*</div>", "\n", html)


def scrape_lyrics_from_html(html):
def scrape_lyrics_from_html(html: str | None) -> str | None:
"""Scrape lyrics from a URL. If no lyrics can be found, return None
instead.
"""
if not html:
return None

def is_text_notcode(text):
if not text:
Expand Down Expand Up @@ -819,10 +802,7 @@ def fetch(self, artist, title, album=None, length=None):
url_link, url_title, title, artist
):
continue
html = self.fetch_url(url_link)
if not html:
continue
lyrics = scrape_lyrics_from_html(html)
lyrics = scrape_lyrics_from_html(self.fetch_url(url_link))
if not lyrics:
continue

Expand Down
Loading

0 comments on commit 7c8eaec

Please sign in to comment.