Swiftgram/submodules/BrowserUI/Sources/BrowserReadability.swift
2024-08-29 11:44:05 +04:00

586 lines
20 KiB
Swift

import Foundation
import WebKit
import AppBundle
import Postbox
import TelegramCore
import InstantPageUI
/// Loads the main HTML resource of a web archive into a zero-sized `WKWebView`, evaluates the
/// bundled reader-mode script once navigation finishes, and reports the parsed page through
/// the completion handler supplied at initialization.
public class Readability: NSObject, WKNavigationDelegate {
    private let url: URL
    let webView: WKWebView
    private let completionHandler: ((_ webPage: (TelegramMediaWebpage, [Any]?)?, _ error: Error?) -> Void)
    /// Set once the reader-mode script has reported a result, so later `didFinish` callbacks are ignored.
    private var hasRenderedReadabilityHTML = false
    /// The `WebSubresources` entries of the archive, forwarded untouched to the completion handler.
    private var subresources: [Any]?

    /// - Parameters:
    ///   - url: Address of the archived page; its absolute string becomes the URL of the parsed page.
    ///   - archiveData: Property-list encoded web archive containing the page HTML.
    ///   - completionHandler: Receives the parsed page together with the archive subresources,
    ///     or `nil` (with an optional error) when no page could be produced.
    init(url: URL, archiveData: Data, completionHandler: @escaping (_ webPage: (TelegramMediaWebpage, [Any]?)?, _ error: Error?) -> Void) {
        self.url = url
        self.completionHandler = completionHandler

        let preferences = WKPreferences()
        let configuration = WKWebViewConfiguration()
        configuration.preferences = preferences
        configuration.userContentController.addUserScript(ReadabilityUserScript())
        self.webView = WKWebView(frame: CGRect.zero, configuration: configuration)

        super.init()

        self.webView.configuration.suppressesIncrementalRendering = true
        self.webView.navigationDelegate = self
        if #available(iOS 16.4, *) {
            self.webView.isInspectable = true
        }

        // NOTE(review): when the archive cannot be decoded nothing is loaded, so
        // `completionHandler` is never invoked for that case; confirm callers tolerate this.
        if let (html, subresources) = extractHtmlString(from: archiveData) {
            self.subresources = subresources
            self.webView.loadHTMLString(html, baseURL: url.baseURL)
        }
    }

    /// Evaluates the bundled `ReaderMode.js` in the web view and converts its dictionary result
    /// into a `TelegramMediaWebpage`.
    ///
    /// `completion` is invoked on every path: with the page on success, with the evaluation
    /// error when the script does not yield a dictionary, and with `(nil, nil)` when the script
    /// resource is missing or the result cannot be turned into a page.
    private func initializeReadability(completion: @escaping (_ result: TelegramMediaWebpage?, _ error: Error?) -> Void) {
        guard let readabilityInitializationJS = loadFile(name: "ReaderMode", type: "js") else {
            // Previously this path returned silently and left the caller waiting forever.
            completion(nil, nil)
            return
        }
        self.webView.evaluateJavaScript(readabilityInitializationJS) { (result, error) in
            guard let result = result as? [String: Any] else {
                completion(nil, error)
                return
            }
            // `parseJson` yields nil for content without blocks; report that instead of dropping it.
            completion(parseJson(result, url: self.url.absoluteString), nil)
        }
    }

    /// Starts the reader-mode extraction when the web view finishes loading, unless a result
    /// has already been delivered.
    public func webView(_ webView: WKWebView, didFinish navigation: WKNavigation!) {
        guard !self.hasRenderedReadabilityHTML else {
            return
        }
        self.initializeReadability() { [weak self] (webPage: TelegramMediaWebpage?, error: Error?) in
            guard let self else {
                return
            }
            self.hasRenderedReadabilityHTML = true
            guard let webPage else {
                self.completionHandler(nil, error)
                return
            }
            self.completionHandler((webPage, self.subresources), error)
        }
    }
}
/// User script that injects the bundled `Readability.js` into the main frame at document end.
class ReadabilityUserScript: WKUserScript {
    convenience override init() {
        guard let js = loadFile(name: "Readability", type: "js") else {
            // The script is loaded via `loadFile`; failing to load it is unrecoverable here, so
            // crash with a message that identifies the missing resource.
            fatalError("ReadabilityUserScript: unable to load Readability.js")
        }
        self.init(source: js, injectionTime: .atDocumentEnd, forMainFrameOnly: true)
    }
}
/// Reads a resource from the bundle returned by `getAppBundle()` and decodes it as UTF-8 text.
/// - Parameters:
///   - name: Resource name without its extension.
///   - type: Resource extension.
/// - Returns: The file contents, or `nil` when the resource is missing, cannot be read,
///   or is not valid UTF-8.
func loadFile(name: String, type: String) -> String? {
    guard
        let resourcePath = getAppBundle().path(forResource: name, ofType: type),
        let contents = try? Data(contentsOf: URL(fileURLWithPath: resourcePath))
    else {
        return nil
    }
    return String(data: contents, encoding: .utf8)
}
/// Pulls the main HTML document and the subresource list out of a property-list web archive.
/// - Parameter webArchiveData: Serialized property list with a `WebMainResource` dictionary.
/// - Returns: The UTF-8 decoded `WebResourceData` of the main resource together with the
///   `WebSubresources` array (if present), or `nil` when the archive cannot be decoded, the
///   main resource data is absent, or the data is not valid UTF-8.
private func extractHtmlString(from webArchiveData: Data) -> (String, [Any]?)? {
    guard
        let archive = try? PropertyListSerialization.propertyList(from: webArchiveData, format: nil) as? [String: Any],
        let mainResource = archive["WebMainResource"] as? [String: Any],
        let rawHtml = mainResource["WebResourceData"] as? Data,
        let html = String(data: rawHtml, encoding: .utf8)
    else {
        return nil
    }
    let subresources = archive["WebSubresources"] as? [Any]
    return (html, subresources)
}
/// Builds a loaded "article" web page from the dictionary produced by the reader-mode script.
/// - Parameters:
///   - input: Script result; `siteName`, `title`, `byline` and `excerpt` are read here, the
///     rest is handed to `parseContent`.
///   - url: Used as the page URL, its display URL and the instant page URL.
/// - Returns: The web page, or `nil` when the content yields no blocks.
private func parseJson(_ input: [String: Any], url: String) -> TelegramMediaWebpage? {
    var media: [MediaId: Media] = [:]
    let blocks = parseContent(input, url, &media)
    if blocks.isEmpty {
        return nil
    }

    let instantPage = InstantPage(
        blocks: blocks,
        media: media,
        isComplete: true,
        rtl: false,
        url: url,
        views: nil
    )
    let loadedContent = TelegramMediaWebpageLoadedContent(
        url: url,
        displayUrl: url,
        hash: 0,
        type: "article",
        websiteName: input["siteName"] as? String,
        title: input["title"] as? String,
        text: input["excerpt"] as? String,
        embedUrl: nil,
        embedType: nil,
        embedSize: nil,
        duration: nil,
        author: input["byline"] as? String,
        isMediaLargeByDefault: nil,
        image: nil,
        file: nil,
        story: nil,
        attributes: [],
        instantPage: instantPage
    )
    return TelegramMediaWebpage(
        webpageId: MediaId(namespace: 0, id: 0),
        content: .Loaded(loadedContent)
    )
}
/// Converts the `content` array of the script result into page blocks and, unless the first
/// block is already a header, prepends a title block and an author block built from the
/// `title` and `byline` entries.
/// - Returns: The blocks, or an empty array when `content` is missing or not an array.
private func parseContent(_ input: [String: Any], _ url: String, _ media: inout [MediaId: Media]) -> [InstantPageBlock] {
    let articleTitle = input["title"] as? String
    let articleByline = input["byline"] as? String
    // The publication time is read but not used; the author block always carries date 0.
    let _ = input["publishedTime"] as? String

    guard let content = input["content"] as? [Any] else {
        return []
    }

    var blocks = parsePageBlocks(content, url, &media)
    if case .header = blocks.first {
        return blocks
    }

    // Insert the byline first so that the title, inserted afterwards at index 0, ends up above it.
    if let articleByline {
        let flattenedByline = articleByline.replacingOccurrences(of: "[\n\t]+", with: " ", options: .regularExpression, range: nil)
        blocks.insert(.authorDate(author: trim(parseRichText(flattenedByline)), date: 0), at: 0)
    }
    if let articleTitle {
        blocks.insert(.title(trim(parseRichText(articleTitle))), at: 0)
    }
    return blocks
}
/// Wraps a plain string in a `RichText.plain` value.
private func parseRichText(_ input: String) -> RichText {
    RichText.plain(input)
}
/// Converts a single element dictionary into rich text.
///
/// The `content` entry may be a string or an array of nested items; anything else yields
/// `.empty`. The result is wrapped in an anchor when the element has a non-empty `id`, then in
/// bold and italic when the `bold` / `italic` keys are present.
private func parseRichText(_ input: [String: Any], _ media: inout [MediaId: Media]) -> RichText {
    let inner: RichText
    switch input["content"] {
    case let string as String:
        inner = parseRichText(string)
    case let array as [Any]:
        inner = parseRichText(array, &media)
    default:
        inner = .empty
    }

    var styled = applyAnchor(inner, item: input)
    if input["bold"] != nil {
        styled = .bold(styled)
    }
    if input["italic"] != nil {
        styled = .italic(styled)
    }
    return styled
}
/// Converts an array of inline items (strings or tagged element dictionaries) into rich text.
///
/// Strings become plain text. Tagged elements are mapped to the matching rich-text style;
/// `img` elements register an image in `media` and produce an inline image, and `br` appends
/// a newline to the previously collected item. Items that are neither strings nor tagged
/// dictionaries are skipped.
/// - Returns: `.empty` for no items, the item itself for a single item, `.concat` otherwise.
private func parseRichText(_ input: [Any], _ media: inout [MediaId: Media]) -> RichText {
    var result: [RichText] = []
    for item in input {
        if let string = item as? String {
            result.append(parseRichText(string))
        } else if let item = item as? [String: Any], let tag = item["tag"] as? String {
            var text: RichText?
            switch tag {
            case "b", "strong":
                text = .bold(parseRichText(item, &media))
            case "i":
                text = .italic(parseRichText(item, &media))
            case "s":
                text = .strikethrough(parseRichText(item, &media))
            case "p":
                text = parseRichText(item, &media)
            case "a":
                if let href = item["href"] as? String {
                    let telString = "tel:"
                    let mailtoString = "mailto:"
                    if href.hasPrefix(telString) {
                        // Strip the scheme; `dropFirst` cannot run past the end of the string.
                        text = .phone(text: parseRichText(item, &media), phone: String(href.dropFirst(telString.count)))
                    } else if href.hasPrefix(mailtoString) {
                        text = .email(text: parseRichText(item, &media), email: String(href.dropFirst(mailtoString.count)))
                    } else {
                        text = .url(text: parseRichText(item, &media), url: href, webpageId: nil)
                    }
                } else {
                    text = parseRichText(item, &media)
                }
            case "pre", "code":
                text = .fixed(parseRichText(item, &media))
            case "mark":
                text = .marked(parseRichText(item, &media))
            case "sub":
                text = .subscript(parseRichText(item, &media))
            case "sup":
                text = .superscript(parseRichText(item, &media))
            case "img":
                if let src = item["src"] as? String, !src.isEmpty {
                    // Width and height arrive as strings; anything unparsable counts as 0.
                    let width = (item["width"] as? String).flatMap { Int32($0) } ?? 0
                    let height = (item["height"] as? String).flatMap { Int32($0) } ?? 0
                    let dimensions = PixelDimensions(width: width, height: height)

                    // Same namespace as block-level images in `parseImage`; the running media
                    // count keeps ids unique within the page.
                    let id = MediaId(namespace: Namespaces.Media.CloudImage, id: Int64(media.count))
                    media[id] = TelegramMediaImage(
                        imageId: id,
                        representations: [
                            TelegramMediaImageRepresentation(
                                dimensions: dimensions,
                                resource: InstantPageExternalMediaResource(url: src),
                                progressiveSizes: [],
                                immediateThumbnailData: nil
                            )
                        ],
                        immediateThumbnailData: nil,
                        reference: nil,
                        partialReference: nil,
                        flags: []
                    )
                    text = .image(id: id, dimensions: dimensions)
                }
            case "br":
                // A line break has no text of its own; it extends the previous item, if any.
                if let last = result.last {
                    result[result.count - 1] = addNewLine(last)
                }
            default:
                text = parseRichText(item, &media)
            }
            if let text {
                result.append(applyAnchor(text, item: item))
            }
        }
    }

    // Only wrap in `.concat` when there is more than one item. The previous `!result.isEmpty`
    // check made the single-item branch unreachable.
    if result.count > 1 {
        return .concat(result)
    } else if let text = result.first {
        return text
    } else {
        return .empty
    }
}
/// Placeholder for leading-whitespace trimming; currently returns `input` unchanged.
private func trimStart(_ input: RichText) -> RichText {
    input
}
/// Placeholder for trailing-whitespace trimming; currently returns `input` unchanged.
private func trimEnd(_ input: RichText) -> RichText {
    input
}
/// Placeholder for whitespace trimming on both ends; currently returns `input` unchanged.
private func trim(_ input: RichText) -> RichText {
    input
}
/// Appends a newline to the innermost trailing plain text of `input`, keeping every wrapping
/// style intact.
///
/// `.empty` and `.image` are returned unchanged, as is an empty `.concat` (which previously
/// crashed on an out-of-range index).
private func addNewLine(_ input: RichText) -> RichText {
    switch input {
    case .empty:
        return .empty
    case let .plain(string):
        return .plain(string + "\n")
    case let .bold(richText):
        return .bold(addNewLine(richText))
    case let .italic(richText):
        return .italic(addNewLine(richText))
    case let .underline(richText):
        return .underline(addNewLine(richText))
    case let .strikethrough(richText):
        return .strikethrough(addNewLine(richText))
    case let .fixed(richText):
        return .fixed(addNewLine(richText))
    case let .url(richText, url, webpageId):
        return .url(text: addNewLine(richText), url: url, webpageId: webpageId)
    case let .email(richText, email):
        return .email(text: addNewLine(richText), email: email)
    case let .subscript(richText):
        return .subscript(addNewLine(richText))
    case let .superscript(richText):
        return .superscript(addNewLine(richText))
    case let .marked(richText):
        return .marked(addNewLine(richText))
    case let .phone(richText, phone):
        return .phone(text: addNewLine(richText), phone: phone)
    case let .anchor(richText, name):
        return .anchor(text: addNewLine(richText), name: name)
    case var .concat(array):
        // Only the last element receives the newline; an empty concatenation has none.
        guard let lastIndex = array.indices.last else {
            return input
        }
        array[lastIndex] = addNewLine(array[lastIndex])
        return .concat(array)
    case .image:
        return input
    }
}
/// Wraps `input` in an anchor named after the element's `id`, when that id is a non-empty
/// string; otherwise returns `input` as is.
private func applyAnchor(_ input: RichText, item: [String: Any]) -> RichText {
    if let anchorName = item["id"] as? String, !anchorName.isEmpty {
        return .anchor(text: input, name: anchorName)
    }
    return input
}
/// Builds a bordered, striped table block from a `table` element.
///
/// The title comes from the element's `title` string (empty when absent) and is anchored with
/// the element's `id`; the rows are collected from the element's `content` array.
private func parseTable(_ input: [String: Any], _ media: inout [MediaId: Media]) -> InstantPageBlock {
    let rawTitle = (input["title"] as? String) ?? ""
    let tableTitle = trim(applyAnchor(parseRichText(rawTitle), item: input))
    let rowItems = (input["content"] as? [Any]) ?? []
    let tableRows = parseTableRows(rowItems, &media)
    return .table(title: tableTitle, rows: tableRows, bordered: true, striped: true)
}
/// Collects table rows from a list of elements, descending into the `content` of any element
/// that is not itself a `tr`. Non-dictionary items are ignored.
private func parseTableRows(_ input: [Any], _ media: inout [MediaId: Media]) -> [InstantPageTableRow] {
    var rows: [InstantPageTableRow] = []
    for case let element as [String: Any] in input {
        if (element["tag"] as? String) == "tr" {
            rows.append(parseTableRow(element, &media))
            continue
        }
        guard let nested = element["content"] as? [Any] else {
            continue
        }
        rows += parseTableRows(nested, &media)
    }
    return rows
}
/// Builds a table row from a `tr` element, creating one cell per `td` / `th` child.
///
/// A cell's text comes from its `content` array and is wrapped in bold and/or italic when the
/// `bold` / `italic` keys are present. Both styles are nested when both are set; previously
/// italic replaced bold. Cells without a `content` array have no text.
private func parseTableRow(_ input: [String: Any], _ media: inout [MediaId: Media]) -> InstantPageTableRow {
    var cells: [InstantPageTableCell] = []
    let content = (input["content"] as? [Any]) ?? []
    for case let item as [String: Any] in content {
        let tag = item["tag"] as? String
        guard tag == "td" || tag == "th" else {
            continue
        }

        var text: RichText?
        if let cellContent = item["content"] as? [Any] {
            var cellText = trim(parseRichText(cellContent, &media))
            // Apply bold first, then italic, matching the order used for inline elements.
            if item["bold"] != nil {
                cellText = .bold(cellText)
            }
            if item["italic"] != nil {
                cellText = .italic(cellText)
            }
            text = cellText
        }

        cells.append(InstantPageTableCell(
            text: text,
            header: tag == "th",
            // NOTE(review): the alignment key is "xcenter" as in the original; confirm it
            // matches what the reader-mode script emits.
            alignment: item["xcenter"] != nil ? .center : .left,
            verticalAlignment: .middle,
            // Span attributes arrive as strings; missing or unparsable values become 0.
            colspan: ((item["colspan"] as? String).flatMap { Int32($0) }) ?? 0,
            rowspan: ((item["rowspan"] as? String).flatMap { Int32($0) }) ?? 0
        ))
    }
    return InstantPageTableRow(cells: cells)
}
/// Builds a collapsible details block from a `details` element.
///
/// The first `summary` child supplies the title and is removed from the body; the remaining
/// children are parsed as page blocks. The block starts expanded when the `open` key is present.
/// - Returns: The details block, or `nil` when the element has no `content` array.
private func parseDetails(_ item: [String: Any], _ url: String, _ media: inout [MediaId: Media]) -> InstantPageBlock? {
    // The key was previously misspelled ("contant"), so this always returned nil.
    guard var content = item["content"] as? [Any] else {
        return nil
    }

    var title: RichText = .empty
    let summaryIndex = content.firstIndex { child in
        ((child as? [String: Any])?["tag"] as? String) == "summary"
    }
    if let summaryIndex, let summary = content[summaryIndex] as? [String: Any] {
        title = trim(parseRichText(summary, &media))
        content.remove(at: summaryIndex)
    }

    return .details(
        title: title,
        blocks: parsePageBlocks(content, url, &media),
        expanded: item["open"] != nil
    )
}
/// Builds a list block from a `ul` / `ol` element, one text item per `li` child.
/// - Returns: The list (ordered when the tag is `ol`), or `nil` when the element lacks a
///   `content` array or a `tag` string.
private func parseList(_ input: [String: Any], _ media: inout [MediaId: Media]) -> InstantPageBlock? {
    guard let children = input["content"] as? [Any], let listTag = input["tag"] as? String else {
        return nil
    }

    var items: [InstantPageListItem] = []
    for case let child as [String: Any] in children where (child["tag"] as? String) == "li" {
        items.append(.text(trim(parseRichText(child, &media)), nil))
    }
    return .list(items: items, ordered: listTag == "ol")
}
/// Builds an image block from an `img` element and registers the image in `media`.
///
/// The `alt` string, when present, becomes the caption text. `width` and `height` are read
/// from string attributes and default to 0. The media id uses the current media count.
/// - Returns: The image block, or `nil` when the element has no `src` string.
private func parseImage(_ input: [String: Any], _ media: inout [MediaId: Media]) -> InstantPageBlock? {
    guard let src = input["src"] as? String else {
        return nil
    }

    // Reads a numeric attribute stored as a string; missing or unparsable values count as 0.
    func dimension(_ key: String) -> Int32 {
        guard let raw = input[key] as? String, let parsed = Int32(raw) else {
            return 0
        }
        return parsed
    }

    let captionText: RichText
    if let alt = input["alt"] as? String {
        captionText = trim(parseRichText(alt))
    } else {
        captionText = .empty
    }
    let caption = InstantPageCaption(text: captionText, credit: .empty)

    let representation = TelegramMediaImageRepresentation(
        dimensions: PixelDimensions(width: dimension("width"), height: dimension("height")),
        resource: InstantPageExternalMediaResource(url: src),
        progressiveSizes: [],
        immediateThumbnailData: nil
    )

    let id = MediaId(namespace: Namespaces.Media.CloudImage, id: Int64(media.count))
    media[id] = TelegramMediaImage(
        imageId: id,
        representations: [representation],
        immediateThumbnailData: nil,
        reference: nil,
        partialReference: nil,
        flags: []
    )
    return .image(id: id, caption: caption, url: nil, webpageId: nil)
}
/// Builds a block from a `figure` element: the block of its `img` child, captioned with the
/// text of its `figcaption` child when there is one.
///
/// When several `img` or `figcaption` children exist, the last of each wins.
/// - Returns: The block, or `nil` when the element has no `content` array or no usable image.
private func parseFigure(_ input: [String: Any], _ media: inout [MediaId: Media]) -> InstantPageBlock? {
    guard let content = input["content"] as? [Any] else {
        return nil
    }

    var block: InstantPageBlock?
    var caption: RichText?
    for case let item as [String: Any] in content {
        guard let tag = item["tag"] as? String else {
            continue
        }
        switch tag {
        case "img":
            block = parseImage(item, &media)
        case "figcaption":
            // The HTML element is `figcaption`; the previous "figurecaption" never matched.
            caption = trim(parseRichText(item, &media))
        default:
            break
        }
    }

    guard let block else {
        return nil
    }
    // A figure caption replaces whatever caption the image block carried.
    if let caption, case let .image(id, _, url, webpageId) = block {
        return .image(id: id, caption: InstantPageCaption(text: caption, credit: .empty), url: url, webpageId: webpageId)
    }
    return block
}
/// Converts a list of top-level items into page blocks.
///
/// Bare strings become paragraphs. Tagged elements are mapped by tag; elements with an
/// unrecognized tag are flattened by parsing their `content` array in place. Items that are
/// neither strings nor tagged dictionaries are skipped.
private func parsePageBlocks(_ input: [Any], _ url: String, _ media: inout [MediaId: Media]) -> [InstantPageBlock] {
    var blocks: [InstantPageBlock] = []
    for element in input {
        if let plainText = element as? String {
            blocks.append(.paragraph(parseRichText(plainText)))
            continue
        }
        guard let node = element as? [String: Any], let tag = node["tag"] as? String else {
            continue
        }

        switch tag {
        case "p":
            blocks.append(.paragraph(trim(parseRichText(node, &media))))
        case "h1", "h2":
            blocks.append(.header(trim(parseRichText(node, &media))))
        case "h3", "h4", "h5", "h6":
            blocks.append(.subheader(trim(parseRichText(node, &media))))
        case "pre":
            blocks.append(.preformatted(.fixed(trim(parseRichText(node, &media)))))
        case "blockquote":
            blocks.append(.blockQuote(text: .italic(trim(parseRichText(node, &media))), caption: .empty))
        case "img":
            if let imageBlock = parseImage(node, &media) {
                blocks.append(imageBlock)
            }
        case "figure":
            if let figureBlock = parseFigure(node, &media) {
                blocks.append(figureBlock)
            }
        case "table":
            blocks.append(parseTable(node, &media))
        case "ul", "ol":
            if let listBlock = parseList(node, &media) {
                blocks.append(listBlock)
            }
        case "hr":
            blocks.append(.divider)
        case "details":
            if let detailsBlock = parseDetails(node, url, &media) {
                blocks.append(detailsBlock)
            }
        default:
            // Unknown containers contribute their children directly.
            if let children = node["content"] as? [Any] {
                blocks += parsePageBlocks(children, url, &media)
            }
        }
    }
    return blocks
}