Unescape HTML titles in archive script
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env ruby
|
#!/usr/bin/env ruby
|
||||||
|
|
||||||
|
require "cgi"
|
||||||
require "uri"
|
require "uri"
|
||||||
require "digest"
|
require "digest"
|
||||||
require "time"
|
require "time"
|
||||||
@@ -18,11 +19,13 @@ urls.each do |url|
|
|||||||
text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}`
|
text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}`
|
||||||
|
|
||||||
begin
|
begin
|
||||||
title = page_content
|
title = CGI.unescapeHTML(
|
||||||
|
page_content
|
||||||
.scan(/<title[^>]*>(.*?)<\/title>/mi)
|
.scan(/<title[^>]*>(.*?)<\/title>/mi)
|
||||||
.first
|
.first
|
||||||
.first
|
.first
|
||||||
.strip
|
.strip
|
||||||
|
)
|
||||||
rescue => ex
|
rescue => ex
|
||||||
warn "Title error (#{ex}; #{url})"
|
warn "Title error (#{ex}; #{url})"
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
Reference in New Issue
Block a user