content-parser.ts raw
1 import {
2 EMBEDDED_EVENT_REGEX,
3 EMBEDDED_MENTION_REGEX,
4 EMOJI_SHORT_CODE_REGEX,
5 HASHTAG_REGEX,
6 LN_INVOICE_REGEX,
7 URL_REGEX,
8 WS_URL_REGEX,
9 X_URL_REGEX,
10 YOUTUBE_URL_REGEX
11 } from '@/constants'
12 import { isImage, isMedia } from './url'
13
/**
 * Discriminator tags for the nodes that content is split into.
 *
 * - `'text'` is the untyped remainder that later parsers may still split.
 * - `'image'`, `'media'`, `'url'`, `'youtube'` and `'x-post'` are assigned by
 *   `EmbeddedUrlParser` depending on how a matched URL is classified.
 * - `'images'` is never produced by a parser directly; it is created by
 *   `mergeConsecutiveImageNodes` when grouping adjacent `'image'` nodes.
 * - The remaining tags are assigned by the regex-based parsers of the same name.
 */
export type TEmbeddedNodeType =
  | 'text'
  | 'image'
  | 'images'
  | 'media'
  | 'event'
  | 'mention'
  | 'legacy-mention'
  | 'hashtag'
  | 'websocket-url'
  | 'url'
  | 'emoji'
  | 'invoice'
  | 'youtube'
  | 'x-post'
29
/**
 * A single parsed piece of content, discriminated by `type`.
 *
 * Every node kind except `'images'` carries one string in `data` (the matched
 * substring, or the plain text for `'text'` nodes). An `'images'` node carries
 * an array of strings, one entry per grouped `'image'` node.
 */
export type TEmbeddedNode =
  | {
      type: Exclude<TEmbeddedNodeType, 'images'>
      data: string
    }
  | {
      type: 'images'
      data: string[]
    }
39
/**
 * A parser accepted by `parseContent`. Two shapes are supported:
 *
 * - a `{ type, regex }` pair: `parseContent` runs `matchAll(regex)` over each
 *   text node and tags every match with `type` (so `regex` must carry the
 *   global flag, as `matchAll` requires);
 * - a function that receives the data of one text node and returns the nodes
 *   that replace it.
 */
type TContentParser =
  | { type: Exclude<TEmbeddedNodeType, 'images'>; regex: RegExp }
  | ((content: string) => TEmbeddedNode[])
43
/** Regex-based parser: substrings matching `HASHTAG_REGEX` become `'hashtag'` nodes. */
export const EmbeddedHashtagParser: TContentParser = {
  type: 'hashtag',
  regex: HASHTAG_REGEX
}
48
/** Regex-based parser: substrings matching `EMBEDDED_MENTION_REGEX` become `'mention'` nodes. */
export const EmbeddedMentionParser: TContentParser = {
  type: 'mention',
  regex: EMBEDDED_MENTION_REGEX
}
53
/**
 * Regex-based parser that produces `'legacy-mention'` nodes.
 *
 * The pattern matches either `npub1` followed by exactly 58 characters from
 * `[a-z0-9]`, or `nprofile1` followed by one or more characters from `[a-z0-9]`.
 */
export const EmbeddedLegacyMentionParser: TContentParser = {
  type: 'legacy-mention',
  regex: /npub1[a-z0-9]{58}|nprofile1[a-z0-9]+/g
}
58
/** Regex-based parser: substrings matching `EMBEDDED_EVENT_REGEX` become `'event'` nodes. */
export const EmbeddedEventParser: TContentParser = {
  type: 'event',
  regex: EMBEDDED_EVENT_REGEX
}
63
/** Regex-based parser: substrings matching `WS_URL_REGEX` become `'websocket-url'` nodes. */
export const EmbeddedWebsocketUrlParser: TContentParser = {
  type: 'websocket-url',
  regex: WS_URL_REGEX
}
68
/** Regex-based parser: substrings matching `EMOJI_SHORT_CODE_REGEX` become `'emoji'` nodes. */
export const EmbeddedEmojiParser: TContentParser = {
  type: 'emoji',
  regex: EMOJI_SHORT_CODE_REGEX
}
73
/** Regex-based parser: substrings matching `LN_INVOICE_REGEX` become `'invoice'` nodes. */
export const EmbeddedLNInvoiceParser: TContentParser = {
  type: 'invoice',
  regex: LN_INVOICE_REGEX
}
78
/**
 * Function-style parser that splits a piece of text on every `URL_REGEX` match.
 *
 * Each matched URL becomes its own node, tagged by the first rule that applies:
 * `isImage` -> `'image'`, `isMedia` -> `'media'`, `YOUTUBE_URL_REGEX` -> `'youtube'`,
 * `X_URL_REGEX` -> `'x-post'`, otherwise `'url'`. The text between, before and
 * after the matches is returned as `'text'` nodes, in original order.
 *
 * @param content - The text to scan.
 * @returns The nodes that replace `content`; empty when `content` is empty.
 */
export const EmbeddedUrlParser: TContentParser = (content: string) => {
  // Picks the most specific node type for a URL; order of the checks matters.
  const classify = (link: string): Exclude<TEmbeddedNodeType, 'images'> => {
    if (isImage(link)) return 'image'
    if (isMedia(link)) return 'media'
    if (link.match(YOUTUBE_URL_REGEX)) return 'youtube'
    if (link.match(X_URL_REGEX)) return 'x-post'
    return 'url'
  }

  const pieces: TEmbeddedNode[] = []
  let cursor = 0

  for (const hit of content.matchAll(URL_REGEX)) {
    const start = hit.index!
    const link = hit[0]

    // Plain text sitting between the previous match (or the start) and this one.
    if (start > cursor) {
      pieces.push({ type: 'text', data: content.slice(cursor, start) })
    }

    pieces.push({ type: classify(link), data: link })
    cursor = start + link.length
  }

  // Trailing text after the final match (or the whole input when nothing matched).
  if (cursor < content.length) {
    pieces.push({ type: 'text', data: content.slice(cursor) })
  }

  return pieces
}
122
/**
 * Splits `content` into typed nodes by applying `parsers` one after another.
 *
 * The trimmed content starts as a single `'text'` node. Each parser, in array
 * order, is applied only to the nodes that are still `'text'`; nodes already
 * typed by an earlier parser are left untouched, so earlier parsers take
 * precedence. After all parsers ran, adjacent text nodes are merged, adjacent
 * image nodes are grouped, and newlines next to block nodes are removed.
 *
 * @param content - Raw content string; a falsy value yields an empty array.
 * @param parsers - Parsers to apply, in priority order.
 * @returns The resulting node list.
 */
export function parseContent(content: string, parsers: TContentParser[]) {
  if (!content) return []

  let nodes: TEmbeddedNode[] = [{ type: 'text', data: content.trim() }]

  for (const parser of parsers) {
    const expanded: TEmbeddedNode[] = []

    for (const node of nodes) {
      // Only untyped text is eligible for further splitting.
      if (node.type !== 'text') {
        expanded.push(node)
        continue
      }

      // Function parsers do their own splitting.
      if (typeof parser === 'function') {
        for (const produced of parser(node.data)) {
          expanded.push(produced)
        }
        continue
      }

      const source = node.data
      let cursor = 0
      for (const hit of source.matchAll(parser.regex)) {
        const start = hit.index!
        const matched = hit[0] // the whole matched string

        // Text between the previous match (or the start) and this one.
        if (start > cursor) {
          expanded.push({ type: 'text', data: source.slice(cursor, start) })
        }

        expanded.push({ type: parser.type, data: matched })
        cursor = start + matched.length
      }

      // Text after the last match (or the whole node when nothing matched).
      if (cursor < source.length) {
        expanded.push({ type: 'text', data: source.slice(cursor) })
      }
    }

    // Drop nodes whose data is the empty string before the next parser runs.
    nodes = expanded.filter((n) => n.data !== '')
  }

  return removeExtraNewlines(mergeConsecutiveImageNodes(mergeConsecutiveTextNodes(nodes)))
}
177
/**
 * Collapses every run of adjacent `'text'` nodes into a single `'text'` node
 * whose data is the concatenation of the run. Runs that concatenate to the
 * empty string are dropped. Non-text nodes are passed through unchanged
 * (same object references) and keep their relative order.
 *
 * @param nodes - Node list to compact; not modified.
 * @returns A new array with merged text nodes.
 */
function mergeConsecutiveTextNodes(nodes: TEmbeddedNode[]) {
  const output: TEmbeddedNode[] = []
  let pending = ''

  // Emits the accumulated text, if any, and resets the accumulator.
  const flush = () => {
    if (pending === '') return
    output.push({ type: 'text', data: pending })
    pending = ''
  }

  for (const node of nodes) {
    if (node.type === 'text') {
      pending += node.data
      continue
    }
    flush()
    output.push(node)
  }
  flush()

  return output
}
200
/**
 * Groups adjacent `'image'` nodes into a single `'images'` node.
 *
 * An `'image'` node is appended to the preceding output node when that node is
 * an `'images'` group; otherwise it starts a new group. A whitespace-only
 * `'text'` node is dropped only when it sits between an `'images'` group and a
 * following `'image'` node, so images separated by blank text still end up in
 * one group. Everything else is passed through unchanged.
 *
 * @param nodes - Node list to process.
 * @returns A new array in which no `'image'` node remains.
 */
function mergeConsecutiveImageNodes(nodes: TEmbeddedNode[]) {
  const output: TEmbeddedNode[] = []

  for (const [idx, current] of nodes.entries()) {
    const tail = output[output.length - 1]
    const tailIsGroup = Boolean(tail) && tail.type === 'images'

    if (current.type === 'image') {
      if (tail && tail.type === 'images') {
        tail.data.push(current.data)
      } else {
        output.push({ type: 'images', data: [current.data] })
      }
      continue
    }

    const isBlankText = current.type === 'text' && current.data.trim() === ''
    if (isBlankText) {
      const upcoming = nodes[idx + 1]
      const imageFollows = Boolean(upcoming) && upcoming.type === 'image'
      // Whitespace sandwiched between images would otherwise split the group.
      if (tailIsGroup && imageFollows) continue
    }

    output.push(current)
  }

  return output
}
227
/**
 * Removes the line break that directly touches a block-level node.
 *
 * Block-level nodes are `'image'`, `'images'`, `'media'` and `'event'`. For every
 * other node, one leading line break (optionally preceded by spaces) is removed
 * when the previous node is a block, and one trailing line break (optionally
 * followed by spaces) is removed when the next node is a block.
 *
 * The block list previously named `'video'`, which is not a member of
 * `TEmbeddedNodeType`; audio/video URLs are tagged `'media'`, so that tag is
 * used here.
 *
 * @param nodes - Node list to clean up; not modified.
 * @returns A new array. Block nodes are passed through by reference; all other
 *   nodes are copied with their adjusted `data`.
 */
function removeExtraNewlines(nodes: TEmbeddedNode[]) {
  const blockTypes: TEmbeddedNodeType[] = ['image', 'images', 'media', 'event']
  const isBlockNode = (node: TEmbeddedNode | undefined) =>
    node !== undefined && blockTypes.includes(node.type)

  return nodes.map((node, i): TEmbeddedNode => {
    // The explicit 'images' check narrows `node.data` to string below.
    if (node.type === 'images' || isBlockNode(node)) return node

    let data = node.data
    if (isBlockNode(nodes[i - 1])) {
      data = data.replace(/^[ ]*\n/, '')
    }
    if (isBlockNode(nodes[i + 1])) {
      data = data.replace(/\n[ ]*$/, '')
    }
    return { type: node.type, data }
  })
}
256