Unescape HTML titles in archive script

This commit is contained in:
David Eisinger
2024-01-30 10:27:00 -05:00
parent fd86fe4bca
commit 91b2863e34

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env ruby #!/usr/bin/env ruby
require "cgi"
require "uri" require "uri"
require "digest" require "digest"
require "time" require "time"
@@ -18,11 +19,13 @@ urls.each do |url|
text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}` text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}`
begin begin
title = page_content title = CGI.unescapeHTML(
.scan(/<title[^>]*>(.*?)<\/title>/mi) page_content
.first .scan(/<title[^>]*>(.*?)<\/title>/mi)
.first .first
.strip .first
.strip
)
rescue => ex rescue => ex
warn "Title error (#{ex}; #{url})" warn "Title error (#{ex}; #{url})"
exit 1 exit 1