content-parser.ts raw

   1  import {
   2    EMBEDDED_EVENT_REGEX,
   3    EMBEDDED_MENTION_REGEX,
   4    EMOJI_SHORT_CODE_REGEX,
   5    HASHTAG_REGEX,
   6    LN_INVOICE_REGEX,
   7    URL_REGEX,
   8    WS_URL_REGEX,
   9    X_URL_REGEX,
  10    YOUTUBE_URL_REGEX
  11  } from '@/constants'
  12  import { isImage, isMedia } from './url'
  13  
  14  export type TEmbeddedNodeType =
  15    | 'text'
  16    | 'image'
  17    | 'images'
  18    | 'media'
  19    | 'event'
  20    | 'mention'
  21    | 'legacy-mention'
  22    | 'hashtag'
  23    | 'websocket-url'
  24    | 'url'
  25    | 'emoji'
  26    | 'invoice'
  27    | 'youtube'
  28    | 'x-post'
  29  
  30  export type TEmbeddedNode =
  31    | {
  32        type: Exclude<TEmbeddedNodeType, 'images'>
  33        data: string
  34      }
  35    | {
  36        type: 'images'
  37        data: string[]
  38      }
  39  
  40  type TContentParser =
  41    | { type: Exclude<TEmbeddedNodeType, 'images'>; regex: RegExp }
  42    | ((content: string) => TEmbeddedNode[])
  43  
  44  export const EmbeddedHashtagParser: TContentParser = {
  45    type: 'hashtag',
  46    regex: HASHTAG_REGEX
  47  }
  48  
  49  export const EmbeddedMentionParser: TContentParser = {
  50    type: 'mention',
  51    regex: EMBEDDED_MENTION_REGEX
  52  }
  53  
  54  export const EmbeddedLegacyMentionParser: TContentParser = {
  55    type: 'legacy-mention',
  56    regex: /npub1[a-z0-9]{58}|nprofile1[a-z0-9]+/g
  57  }
  58  
  59  export const EmbeddedEventParser: TContentParser = {
  60    type: 'event',
  61    regex: EMBEDDED_EVENT_REGEX
  62  }
  63  
  64  export const EmbeddedWebsocketUrlParser: TContentParser = {
  65    type: 'websocket-url',
  66    regex: WS_URL_REGEX
  67  }
  68  
  69  export const EmbeddedEmojiParser: TContentParser = {
  70    type: 'emoji',
  71    regex: EMOJI_SHORT_CODE_REGEX
  72  }
  73  
  74  export const EmbeddedLNInvoiceParser: TContentParser = {
  75    type: 'invoice',
  76    regex: LN_INVOICE_REGEX
  77  }
  78  
  79  export const EmbeddedUrlParser: TContentParser = (content: string) => {
  80    const matches = content.matchAll(URL_REGEX)
  81    const result: TEmbeddedNode[] = []
  82    let lastIndex = 0
  83    for (const match of matches) {
  84      const matchStart = match.index!
  85      // Add text before the match
  86      if (matchStart > lastIndex) {
  87        result.push({
  88          type: 'text',
  89          data: content.slice(lastIndex, matchStart)
  90        })
  91      }
  92  
  93      const url = match[0]
  94      let type: TEmbeddedNodeType = 'url'
  95      if (isImage(url)) {
  96        type = 'image'
  97      } else if (isMedia(url)) {
  98        type = 'media'
  99      } else if (url.match(YOUTUBE_URL_REGEX)) {
 100        type = 'youtube'
 101      } else if (url.match(X_URL_REGEX)) {
 102        type = 'x-post'
 103      }
 104  
 105      // Add the match as specific type
 106      result.push({
 107        type,
 108        data: url
 109      })
 110  
 111      lastIndex = matchStart + url.length
 112    }
 113    // Add text after the last match
 114    if (lastIndex < content.length) {
 115      result.push({
 116        type: 'text',
 117        data: content.slice(lastIndex)
 118      })
 119    }
 120    return result
 121  }
 122  
 123  export function parseContent(content: string, parsers: TContentParser[]) {
 124    if (!content) return []
 125    let nodes: TEmbeddedNode[] = [{ type: 'text', data: content.trim() }]
 126  
 127    parsers.forEach((parser) => {
 128      nodes = nodes
 129        .flatMap((node) => {
 130          if (node.type !== 'text') return [node]
 131  
 132          if (typeof parser === 'function') {
 133            return parser(node.data)
 134          }
 135  
 136          const matches = node.data.matchAll(parser.regex)
 137          const result: TEmbeddedNode[] = []
 138          let lastIndex = 0
 139          for (const match of matches) {
 140            const matchStart = match.index!
 141            // Add text before the match
 142            if (matchStart > lastIndex) {
 143              result.push({
 144                type: 'text',
 145                data: node.data.slice(lastIndex, matchStart)
 146              })
 147            }
 148  
 149            // Add the match as specific type
 150            result.push({
 151              type: parser.type,
 152              data: match[0] // The whole matched string
 153            })
 154  
 155            lastIndex = matchStart + match[0].length
 156          }
 157  
 158          // Add text after the last match
 159          if (lastIndex < node.data.length) {
 160            result.push({
 161              type: 'text',
 162              data: node.data.slice(lastIndex)
 163            })
 164          }
 165  
 166          return result
 167        })
 168        .filter((n) => n.data !== '')
 169    })
 170  
 171    nodes = mergeConsecutiveTextNodes(nodes)
 172    nodes = mergeConsecutiveImageNodes(nodes)
 173    nodes = removeExtraNewlines(nodes)
 174  
 175    return nodes
 176  }
 177  
 178  function mergeConsecutiveTextNodes(nodes: TEmbeddedNode[]) {
 179    const merged: TEmbeddedNode[] = []
 180    let currentText = ''
 181  
 182    nodes.forEach((node) => {
 183      if (node.type === 'text') {
 184        currentText += node.data
 185      } else {
 186        if (currentText) {
 187          merged.push({ type: 'text', data: currentText })
 188          currentText = ''
 189        }
 190        merged.push(node)
 191      }
 192    })
 193  
 194    if (currentText) {
 195      merged.push({ type: 'text', data: currentText })
 196    }
 197  
 198    return merged
 199  }
 200  
 201  function mergeConsecutiveImageNodes(nodes: TEmbeddedNode[]) {
 202    const merged: TEmbeddedNode[] = []
 203    nodes.forEach((node, i) => {
 204      if (node.type === 'image') {
 205        const lastNode = merged[merged.length - 1]
 206        if (lastNode && lastNode.type === 'images') {
 207          lastNode.data.push(node.data)
 208        } else {
 209          merged.push({ type: 'images', data: [node.data] })
 210        }
 211      } else if (node.type === 'text' && node.data.trim() === '') {
 212        // Only remove whitespace-only text nodes if they are sandwiched between image nodes.
 213        const prev = merged[merged.length - 1]
 214        const next = nodes[i + 1]
 215        if (prev && prev.type === 'images' && next && next.type === 'image') {
 216          return // skip this whitespace node
 217        } else {
 218          merged.push(node)
 219        }
 220      } else {
 221        merged.push(node)
 222      }
 223    })
 224  
 225    return merged
 226  }
 227  
 228  function removeExtraNewlines(nodes: TEmbeddedNode[]) {
 229    const isBlockNode = (node: TEmbeddedNode) => {
 230      return ['image', 'images', 'video', 'event'].includes(node.type)
 231    }
 232  
 233    const newNodes: TEmbeddedNode[] = []
 234    nodes.forEach((node, i) => {
 235      if (isBlockNode(node)) {
 236        newNodes.push(node)
 237        return
 238      }
 239  
 240      const prev = nodes[i - 1]
 241      const next = nodes[i + 1]
 242      let data = node.data as string
 243      if (prev && isBlockNode(prev)) {
 244        data = data.replace(/^[ ]*\n/, '')
 245      }
 246      if (next && isBlockNode(next)) {
 247        data = data.replace(/\n[ ]*$/, '')
 248      }
 249      newNodes.push({
 250        type: node.type as Exclude<TEmbeddedNodeType, 'images'>,
 251        data
 252      })
 253    })
 254    return newNodes
 255  }
 256