Nuxt3-SEO利剑之 一键生成网站图片的结构化数据

159 阅读4分钟

背景: 在 Nuxt3 等具备 SSR 能力的框架下做 Web 站点时, 如果要做网站的 SEO 优化, 就需要生成网站的结构化数据, 让搜索引擎更好地了解我们的网站。

但是这个过程,又比较繁琐。

网站结构化数据,大概长这样

<script type='application/ld+json'>
[
    {
        "@context": "https://schema.org",
        "@type": "ImageObject",
        "url": "https://storyblok-cdn.photoroom.com/f/191576/480x270/cc402e6795/batch_mode.webp",
        "width": 480,
        "height": 270,
        "encodingFormat": "image/webp"
    }
]
</script>

一个网站大概会有 几十上百个图片, 挨个去写,非常的费劲。还有可能写错。

面临的困扰

  • 工作量大 - 重复性多
  • 结构化数据容易出错-出错,爬虫就不认识
  • 图片在开发环境和生产环境,不一定一致; 开发环境可能是一个 HTTP 请求, 生产环境可能是 base64;

简单写了一个脚本,可以一键生成图片的结构化数据

开发环境

pnpm init

pnpm add cheerio image-size mime -S
node: 20+
  • cheerio 解析HTML
  • image-size 解析图片大小
  • mime 根据图片的 type 生成标准的 MIME 类型, 不要自己手动拼接,不标准

实现思路

  1. 拿到我们某一个页面的HTML 字符串 ,注意,是字符串,不是 chrome里面已经给你解析好的DOM树
  2. 从一堆字符串中解析我们要的内容; 根据图片的存在形式,进行逐一分析
  3. 拿到图片的url 地址
  4. 遍历拿到的这一堆 url 地址, 逐个获取文件流信息; url 本身只是字符串, 图片的宽高存在于文件内容中, 这里使用 fetch 请求, 使用 axios 等其他工具也可以
  5. 拿到文件流以后,使用 image-size 去解析并返回文件的宽度,高度,类型
  6. 这里的类型只有 webp, svg, png 等, 需要使用 mime 这个包,将这种解析为 'image/png', 'image/webp' 等标准的格式
  7. 将json 拼接为 符合 schema org 格式的 结构化数据
  8. 结束。

代码

import * as cheerio from 'cheerio'
import imageSize from 'image-size'
import mime from 'mime'

import fs from 'node:fs'

// Page to analyse. Fetched as-is by getHTMLText and also used as the base for
// image paths found in the page that are not absolute URLs.
let host = 'https://www.photoroom.com/'

/**
 * Download the page at `host` and return its HTML as a raw string.
 *
 * @returns {Promise<string|undefined>} The response body, or `undefined` when
 *   the request fails or the server answers with a non-2xx status (the error
 *   is logged, not thrown).
 */
const getHTMLText = async () => {
  try {
    const res = await fetch(host)

    // fetch() only rejects on network errors; without this check an HTTP
    // error page would be parsed as if it were the real site.
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} ${res.statusText}`)
    }

    return await res.text()
  } catch (error) {
    console.error('获取网页内容失败', error)
    return undefined
  }
}

/**
 * Collect background image URLs declared inside `<style>` elements.
 *
 * NOTE(review): the name suggests externally linked stylesheets, but the
 * implementation only inspects `<style>` elements already present in the
 * document; no linked CSS file is downloaded.
 *
 * @param {Function} $ - Cheerio instance loaded with the page HTML.
 * @returns {string[]} URLs found in `background` / `background-image`
 *   declarations, excluding values that start with `data:image`.
 */
const getCssLinksImages = ($) => {
  const collected = []

  // Walk every <style> element in the document
  $('style').each((_, styleNode) => {
    const css = $(styleNode).html()

    // Group 1 of each match is the text inside url(...), surrounding quotes stripped
    for (const [, link] of css.matchAll(/background(?:-image)?:\s*url\(['"]?(.*?)['"]?\)/g)) {
      if (!link) continue
      if (link.startsWith('data:image')) continue
      collected.push(link)
    }
  })

  return collected
}

/**
 * Collect the `src` of every `<img>` element.
 *
 * Only the `src` attribute is read. Lazy-loaded images usually keep the real
 * address in `data-src`; that is site-specific and not handled here.
 *
 * @param {Function} $ - Cheerio instance loaded with the page HTML.
 * @returns {string[]} Non-empty `src` values that do not contain `data:image`.
 */
const getImageTagImages = ($) => {
  const sources = []

  $('img').each((_, imgNode) => {
    const value = $(imgNode).attr('src')

    // Skip missing/empty attributes and embedded data URIs
    if (!value) return
    if (value.includes('data:image')) return

    sources.push(value)
  })

  return sources
}

/**
 * Collect background image URLs declared in inline `style` attributes.
 *
 * @param {Function} $ - Cheerio instance loaded with the page HTML.
 * @returns {string[]} URLs found in `background` / `background-image`
 *   declarations, excluding values that start with `data:image`.
 */
const getInlineStyleTagImages = ($) => {
  const images = []

  $('[style]').each((index, element) => {
    const style = $(element).attr('style') ?? ''

    // String.prototype.match with a /g regex returns only the full matches and
    // drops capture groups, so indexing [1] never yields the URL. matchAll
    // keeps group 1 (the text inside url(...)) for every declaration.
    for (const [, imageUrl] of style.matchAll(/background(?:-image)?:\s*url\(['"]?(.*?)['"]?\)/g)) {
      if (imageUrl && !imageUrl.startsWith('data:image')) {
        images.push(imageUrl)
      }
    }
  })

  return images
}

/**
 * Collect background image URLs declared inside `<style>` elements.
 *
 * @param {Function} $ - Cheerio instance loaded with the page HTML.
 * @returns {string[]} URLs found in `background` / `background-image`
 *   declarations, excluding values that start with `data:image`.
 */
const getStyleTagsImages = ($) => {
  const found = []

  $('style').each((_, styleEl) => {
    // First pass: every background declaration that carries a url(...)
    const declarations = $(styleEl).html().match(/background(?:-image)?:\s*url\(['"]?(.*?)['"]?\)/g) ?? []

    for (const declaration of declarations) {
      // Second pass: pull the address out of the single declaration
      const [, target] = declaration.match(/url\(['"]?(.*?)['"]?\)/) ?? []

      if (!target || target.startsWith('data:image')) continue

      found.push(target)
    }
  })

  return found
}

/**
 * Collect every image URL referenced by an HTML document.
 *
 * Sources scanned: `<img src>`, inline `style` attributes and `<style>`
 * elements.
 *
 * @param {string} text - Raw HTML of the page.
 * @returns {string[]} Image URLs exactly as written in the page (they may be
 *   relative), in first-seen order with duplicates removed.
 */
const resolveImage = (text) => {
  const $ = cheerio.load(text)

  const imageTags = getImageTagImages($)
  const inlineImages = getInlineStyleTagImages($)
  const styleTagImages = getStyleTagsImages($)
  // getCssLinksImages declares a single parameter, so only `$` is passed
  const cssLinksImages = getCssLinksImages($)

  // The same asset is often referenced several times, and two of the
  // collectors scan <style> elements, so keep only the first occurrence of
  // each URL to avoid fetching and emitting it more than once.
  return [...new Set([...imageTags, ...inlineImages, ...styleTagImages, ...cssLinksImages])]
}

/**
 * Download each image and read its pixel size and MIME type.
 *
 * Images are fetched in parallel. An image that cannot be downloaded or
 * parsed is logged and skipped, so one bad URL does not discard the rest.
 *
 * @param {string[]} list - Absolute image URLs.
 * @returns {Promise<Array<{src: string, width: number, height: number, type: (string|null)}>>}
 *   One entry per successfully parsed image, in input order, with SVG images
 *   filtered out. Always an array (empty when nothing could be parsed).
 */
const resolveImageSize = async (list) => {
  let count = 0

  const outcomes = await Promise.allSettled(
    list.map(async (src) => {
      // `responseType` is an axios option, not an HTTP header; fetch exposes
      // the binary body through arrayBuffer() without any extra option.
      const response = await fetch(src)

      if (!response.ok) {
        throw new Error(`HTTP ${response.status} ${response.statusText}`)
      }

      const uint8Array = new Uint8Array(await response.arrayBuffer())
      const item = await imageSize(uint8Array)

      // Count only after parsing really succeeded
      count++
      console.log('图片信息解析成功:', count)

      return {
        src,
        width: item.width,
        height: item.height,
        // image-size reports a short type such as "png"; mime maps it to "image/png"
        type: mime.getType(item.type)
      }
    })
  )

  const result = []

  outcomes.forEach((outcome, index) => {
    if (outcome.status === 'fulfilled') {
      result.push(outcome.value)
      return
    }

    console.log('图片宽高解析失败')
    console.log(list[index])
    console.log(outcome.reason)
  })

  // SVG images are filtered out; they are not wanted in the structured data
  return result.filter((p) => p.type !== 'image/svg+xml')
}

/**
 * Convert parsed image records into schema.org `ImageObject` entries, ready
 * to be serialised as `application/ld+json`.
 *
 * @param {Array<{src: string, width: number, height: number, type: *}>} list
 *   Parsed image records.
 * @returns {Array<Object>} One `ImageObject` per record, in the same order;
 *   `encodingFormat` is the record's `type` converted to a string.
 */
const wrapJsonLd = (list) =>
  list.map(({ src, width, height, type }) => ({
    '@context': 'https://schema.org',
    '@type': 'ImageObject',
    url: src,
    width,
    height,
    encodingFormat: `${type}`
  }))

/**
 * Run the whole pipeline: download the page, collect its image URLs, measure
 * every image and write the resulting JSON-LD to `./web-application.json`.
 *
 * @returns {Promise<void>} Resolves once the write has been scheduled; the
 *   outcome of the write is reported on the console.
 */
const start = async () => {
  const text = await getHTMLText()

  if (!text) {
    console.log('网站的内容没有解析到')
    return
  }

  // Resolve every address against the page URL. Plain `host + item` produced
  // "https://host//path" for root-relative paths and broke protocol-relative
  // ("//cdn...") ones; the URL constructor handles all of these forms.
  // A Set drops addresses that turn out identical after resolution.
  const absoluteUrls = new Set()

  for (const item of resolveImage(text)) {
    try {
      absoluteUrls.add(new URL(item, host).href)
    } catch (error) {
      console.log('图片地址无效, 已跳过:', item)
    }
  }

  // Read width/height/type of each image; fall back to an empty list so a
  // failed lookup cannot crash the JSON-LD step below
  const imageSizeList = (await resolveImageSize([...absoluteUrls])) ?? []

  // Wrap as application/ld+json structured data
  const jsonLd = wrapJsonLd(imageSizeList)

  fs.writeFile('./web-application.json', JSON.stringify(jsonLd, null, 2), (err) => {
    if (err) {
      console.log('结构化数据写入失败')
      console.log(err)
      return
    }

    console.log('结构化数据写入成功')
  })
}

// Script entry point: run the pipeline once when this file is executed.
start()

结果

[
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/480x270/cc402e6795/batch_mode.webp",
    "width": 480,
    "height": 270,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/1599x900/a67bb8b540/process_hundreds_of_product_images_using_the_photoroom_api_m_.webp",
    "width": 1599,
    "height": 900,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://www.photoroom.com//_next/image?url=%2Fimg%2Fhome%2Fobject-box.webp&w=640&q=75",
    "width": 513,
    "height": 574,
    "encodingFormat": "image/png"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://www.photoroom.com//_next/image?url=%2Fimg%2Fhome%2Flaptop.webp&w=1080&q=75",
    "width": 992,
    "height": 1280,
    "encodingFormat": "image/png"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://www.photoroom.com//_next/image?url=%2Fimg%2Fhome%2Flaptop.webp&w=1080&q=75",
    "width": 992,
    "height": 1280,
    "encodingFormat": "image/png"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/87fe0b04a9/shopify.webp",
    "width": 280,
    "height": 158,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/b5863f3ad6/netflix.webp",
    "width": 280,
    "height": 158,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/00ec587061/warner-bros.webp",
    "width": 280,
    "height": 158,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/2f44770ec6/bulgari.webp",
    "width": 280,
    "height": 158,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/dfa9052663/wolt.webp",
    "width": 280,
    "height": 158,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/420x237/8b250b18ca/daiso.webp",
    "width": 420,
    "height": 237,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/420x237/4ddbfa3ef8/hennessy.webp",
    "width": 420,
    "height": 237,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/834e23c71b/klarna.webp",
    "width": 224,
    "height": 122,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/1048x764/424f986791/instant_backgrounds_examples.webp",
    "width": 1048,
    "height": 764,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/2096x1572/3240e37ea7/ai_expand.webp",
    "width": 2096,
    "height": 1572,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/1048x786/fdcad8feb6/ebay_2.webp",
    "width": 1048,
    "height": 786,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/96x96/e422e0c6b2/logo_free.webp",
    "width": 96,
    "height": 96,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/96x96/e422e0c6b2/logo_free.webp",
    "width": 96,
    "height": 96,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/96x96/18116b9633/logo_enterprise.webp",
    "width": 96,
    "height": 96,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/80x80/7c73842d7b/sebastian-pilch.webp",
    "width": 80,
    "height": 80,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/80x80/cd03d1c011/sue-darte.webp",
    "width": 80,
    "height": 80,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/80x80/82f8c1c75b/jacob-p.webp",
    "width": 80,
    "height": 80,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://www.photoroom.com//_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fphone-static.ff9096fe.webp&w=3840&q=75",
    "width": 440,
    "height": 802,
    "encodingFormat": "image/png"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://www.photoroom.com//_next/image?url=%2F_next%2Fstatic%2Fmedia%2Flaptop.ffe2a6e2.webp&w=3840&q=75",
    "width": 496,
    "height": 640,
    "encodingFormat": "image/png"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/1200x800/4e54b928ef/remove_background.webp",
    "width": 1200,
    "height": 800,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/1200x800/85085e1807/ai_backgrounds_after_.webp",
    "width": 1200,
    "height": 800,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/1200x800/1fb223a626/blur_background_after_.webp",
    "width": 1200,
    "height": 800,
    "encodingFormat": "image/webp"
  },
  {
    "@context": "https://schema.org",
    "@type": "ImageObject",
    "url": "https://storyblok-cdn.photoroom.com/f/191576/1200x800/88361a19c7/retouch_before_.webp",
    "width": 1200,
    "height": 800,
    "encodingFormat": "image/webp"
  }
]

生成的结果,直接可以放在 validator.schema.org/ 上进行验证;

image.png

✿✿ヽ(°▽°)ノ✿ 全部验证通过。😊😊😊