背景: 在 Nuxt3 等具备 SSR 能力的框架下做 Web 站点时, 如果要做网站的 SEO 优化, 就需要生成网站的结构化数据, 让搜索引擎更好地了解我们的网站。
但是这个过程,又比较繁琐。
网站结构化数据,大概长这样
<script type='application/ld+json'>
[
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/480x270/cc402e6795/batch_mode.webp",
"width": 480,
"height": 270,
"encodingFormat": "image/webp"
}
]
</script>
一个网站大概会有 几十上百个图片, 挨个去写,非常的费劲。还有可能写错。
面临的困扰
- 工作量大
- 重复性多
- 结构化数据容易出错, 一旦出错, 爬虫就不认识
- 图片在开发环境和生产环境不一定一致: 开发环境可能是一个 HTTP 请求, 生产环境可能是 base64;
简单写了一个脚本, 可以一键生成图片的结构化数据
开发环境
pnpm init .
pnpm add cheerio image-size mime -S
node: 20+
- cheerio 解析HTML
- image-size 解析图片大小
- mime 根据图片的 type 生成标准的 MIME 类型, 不要自己手动拼接,不标准
实现思路
- 拿到我们某一个页面的 HTML 字符串, 注意, 是字符串, 不是 chrome 里面已经给你解析好的 DOM 树
- 从一堆字符串中解析我们要的内容; 根据图片的存在形式, 进行逐一分析
- 拿到图片的url 地址
- 遍历,拿到的这一堆 url 地址, 拿到 文件流信息, url本身是字符串,图片的宽高是存在文件信息中的, 我们使用 fetch 请求, 使用 axios等其他也都可以
- 拿到文件流以后,使用 image-size 去解析并返回文件的宽度,高度,类型
- 这里的类型只有 webp, svg, png 等, 需要使用 mime 这个包,将这种解析为 'image/png', 'image/webp' 等标准的格式
- 将json 拼接为 符合 schema org 格式的 结构化数据
- 结束。
代码
import * as cheerio from 'cheerio'
import imageSize from 'image-size'
import mime from 'mime'
import fs from 'node:fs'
// Address of the page to scan; it is fetched for its HTML and used as the prefix for image paths.
const host = 'https://www.photoroom.com/'
/**
 * Download the HTML of the page at `host` as raw text.
 *
 * Any failure (network error or non-2xx status) is logged and reported to the
 * caller as `undefined`, so an error page is never parsed as if it were the site.
 * @returns {Promise<string | undefined>} the HTML source, or undefined on failure
 */
const getHTMLText = async () => {
  try {
    const res = await fetch(host)
    // fetch only rejects on network errors; a 404/500 still resolves, so check explicitly.
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} ${res.statusText}`)
    }
    return await res.text()
  } catch (error) {
    console.error('获取网页内容失败', error)
    return undefined
  }
}
/**
 * Collect background image URLs declared in the page's <style> elements.
 *
 * Note: only CSS embedded in the document is scanned; stylesheets referenced
 * through <link> are not downloaded here.
 * @param {*} $ cheerio root for the loaded document
 * @returns {string[]} image URLs in document order, `data:image` URIs excluded
 */
const getCssLinksImages = ($) => {
  const backgroundRule = /background(?:-image)?:\s*url\(['"]?(.*?)['"]?\)/g
  const urlValue = /url\(['"]?(.*?)['"]?\)/
  const found = []
  $('style').each((_, styleEl) => {
    // With a /g regex, match() yields the full declaration strings (or null when none).
    const declarations = $(styleEl).html().match(backgroundRule) || []
    for (const declaration of declarations) {
      const captured = declaration.match(urlValue)
      const url = captured ? captured[1] : ''
      if (url && !url.startsWith('data:image')) {
        found.push(url)
      }
    }
  })
  return found
}
/**
 * Collect the `src` of every <img> element in the document.
 * @param {*} $ cheerio root for the loaded document
 * @returns {string[]} `src` values in document order; missing or empty values
 *   and any value containing `data:image` are skipped
 */
const getImageTagImages = ($) => {
  const sources = []
  $('img').each((_, imgEl) => {
    // Lazy-loading sites often keep the real URL in another attribute (e.g. data-src);
    // that is site specific, so only `src` is read here.
    const source = $(imgEl).attr('src')
    if (!source || source.includes('data:image')) {
      return
    }
    sources.push(source)
  })
  return sources
}
/**
 * Collect background image URLs declared in inline `style="..."` attributes.
 * @param {*} $ cheerio root for the loaded document
 * @returns {string[]} image URLs in document order, `data:image` URIs excluded
 */
const getInlineStyleTagImages = ($) => {
  const backgroundUrl = /background(?:-image)?:\s*url\(['"]?(.*?)['"]?\)/g
  const images = []
  $('[style]').each((index, element) => {
    const style = $(element).attr('style') || ''
    // matchAll exposes the capture group of every hit. String.prototype.match with a
    // /g regex returns only whole-match strings, so its index 1 is the second hit
    // (usually undefined), never the captured URL.
    for (const [, imageUrl] of style.matchAll(backgroundUrl)) {
      if (imageUrl && !imageUrl.startsWith('data:image')) {
        images.push(imageUrl)
      }
    }
  })
  return images
}
/**
 * Collect background image URLs from the CSS text of every <style> element.
 * @param {*} $ cheerio root for the loaded document
 * @returns {string[]} image URLs in document order, `data:image` URIs excluded
 */
const getStyleTagsImages = ($) => {
  const pattern = /background(?:-image)?:\s*url\(['"]?(.*?)['"]?\)/g
  const urls = []
  $('style').each((_, node) => {
    const css = $(node).html()
    // Each hit carries the text between url( and ) — optional quotes stripped — as group 1.
    for (const [, url] of css.matchAll(pattern)) {
      if (url && !url.startsWith('data:image')) {
        urls.push(url)
      }
    }
  })
  return urls
}
/**
 * Extract every image reference from a page's HTML source.
 *
 * Sources scanned: <img src>, inline `style` attributes, and <style> elements.
 * @param {string} text raw HTML of the page
 * @returns {string[]} unique image URLs/paths exactly as written in the page,
 *   in first-seen order
 */
const resolveImage = (text) => {
  const $ = cheerio.load(text)
  const collected = [
    ...getImageTagImages($),
    ...getInlineStyleTagImages($),
    ...getStyleTagsImages($),
    ...getCssLinksImages($)
  ]
  // The same asset is frequently referenced more than once (repeated <img> tags,
  // collectors that may overlap). A Set keeps first-seen order and drops the
  // repeats, so each image is downloaded and emitted only once.
  return [...new Set(collected)]
}
/**
 * Download each image and read its pixel dimensions and MIME type.
 *
 * Images are fetched in parallel. An image that cannot be downloaded or parsed
 * is logged and skipped, so one broken URL does not discard all the others.
 * @param {string[]} list absolute image URLs
 * @returns {Promise<Array<{src: string, width: number, height: number, type: string}>>}
 *   one entry per readable image, in input order, with SVG images removed
 */
const resolveImageSize = async (list) => {
  let count = 0
  const inspect = async (src) => {
    // The body is read as binary through arrayBuffer(); fetch needs no option for that
    // (`responseType` is an axios setting and would only be sent as a stray request header).
    const response = await fetch(src)
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} for ${src}`)
    }
    const bytes = new Uint8Array(await response.arrayBuffer())
    const info = await imageSize(bytes)
    // Count only after the image has really been parsed.
    count++
    console.log('图片信息解析成功:', count)
    return {
      src,
      width: info.width,
      height: info.height,
      // image-size reports a short type name (e.g. "png"); mime turns it into a MIME type
      type: mime.getType(info.type)
    }
  }
  // allSettled instead of all: a single rejection must not throw away the other results.
  const settled = await Promise.allSettled(list.map((src) => inspect(src)))
  const parsed = []
  settled.forEach((outcome, index) => {
    if (outcome.status === 'fulfilled') {
      parsed.push(outcome.value)
      return
    }
    console.log('图片宽高解析失败', list[index])
    console.log(outcome.reason)
  })
  // SVG images are dropped; they are normally not described as structured data.
  return parsed.filter((p) => p.type !== 'image/svg+xml')
}
/**
 * Convert image records into schema.org ImageObject entries, ready to be
 * serialized into an application/ld+json script.
 * @param {Array<{src: *, width: *, height: *, type: *}>} list image records
 * @returns {object[]} one ImageObject per record, in the same order
 */
const wrapJsonLd = (list) => {
  const toImageObject = ({ src, width, height, type }) => ({
    '@context': 'https://schema.org',
    '@type': 'ImageObject',
    url: src,
    width,
    height,
    // encodingFormat is always emitted as a string, whatever the stored type is
    encodingFormat: `${type}`
  })
  return list.map((record) => toImageObject(record))
}
/**
 * Resolve an image reference found in the page against the site URL.
 * @param {string} src URL or path as written in the HTML/CSS
 * @param {string} base absolute URL of the scanned page
 * @returns {string | null} absolute URL, or null when `src` cannot be parsed as a URL
 */
const toAbsoluteUrl = (src, base) => {
  try {
    return new URL(src, base).href
  } catch {
    return null
  }
}

/**
 * Script entry point: fetch the page, collect its images, measure them and
 * write the resulting ImageObject list to ./web-application.json.
 * @returns {Promise<void>}
 */
const start = async () => {
  const text = await getHTMLText()
  if (!text) {
    console.log('网站的内容没有解析到')
    return
  }
  // Relative paths ("/a.png", "img/a.png") and protocol-relative URLs ("//cdn/a.png")
  // must become absolute before they can be fetched. URL resolution, unlike plain
  // `host + path` concatenation, never produces a doubled slash after the domain.
  const imageList = resolveImage(text)
    .map((item) => toAbsoluteUrl(item, host))
    .filter((url) => url !== null)
  // Read the width/height of every image
  const imageSizeList = await resolveImageSize(imageList)
  // Guard: if sizing yields no array there is nothing to wrap, and mapping over it would throw.
  if (!Array.isArray(imageSizeList)) {
    return
  }
  // Wrap as application/ld+json structured data
  const jsonLd = wrapJsonLd(imageSizeList)
  fs.writeFile('./web-application.json', JSON.stringify(jsonLd, null, 2), (err) => {
    if (err) {
      console.log('结构化数据写入失败')
      console.log(err)
      return
    }
    console.log('结构化数据写入成功')
  })
}
// Never leave the promise floating: report an unexpected failure and exit non-zero.
start().catch((error) => {
  console.error(error)
  process.exitCode = 1
})
结果
[
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/480x270/cc402e6795/batch_mode.webp",
"width": 480,
"height": 270,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/1599x900/a67bb8b540/process_hundreds_of_product_images_using_the_photoroom_api_m_.webp",
"width": 1599,
"height": 900,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://www.photoroom.com//_next/image?url=%2Fimg%2Fhome%2Fobject-box.webp&w=640&q=75",
"width": 513,
"height": 574,
"encodingFormat": "image/png"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://www.photoroom.com//_next/image?url=%2Fimg%2Fhome%2Flaptop.webp&w=1080&q=75",
"width": 992,
"height": 1280,
"encodingFormat": "image/png"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://www.photoroom.com//_next/image?url=%2Fimg%2Fhome%2Flaptop.webp&w=1080&q=75",
"width": 992,
"height": 1280,
"encodingFormat": "image/png"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/87fe0b04a9/shopify.webp",
"width": 280,
"height": 158,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/b5863f3ad6/netflix.webp",
"width": 280,
"height": 158,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/00ec587061/warner-bros.webp",
"width": 280,
"height": 158,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/2f44770ec6/bulgari.webp",
"width": 280,
"height": 158,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/dfa9052663/wolt.webp",
"width": 280,
"height": 158,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/420x237/8b250b18ca/daiso.webp",
"width": 420,
"height": 237,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/420x237/4ddbfa3ef8/hennessy.webp",
"width": 420,
"height": 237,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/280x158/834e23c71b/klarna.webp",
"width": 224,
"height": 122,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/1048x764/424f986791/instant_backgrounds_examples.webp",
"width": 1048,
"height": 764,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/2096x1572/3240e37ea7/ai_expand.webp",
"width": 2096,
"height": 1572,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/1048x786/fdcad8feb6/ebay_2.webp",
"width": 1048,
"height": 786,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/96x96/e422e0c6b2/logo_free.webp",
"width": 96,
"height": 96,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/96x96/e422e0c6b2/logo_free.webp",
"width": 96,
"height": 96,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/96x96/18116b9633/logo_enterprise.webp",
"width": 96,
"height": 96,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/80x80/7c73842d7b/sebastian-pilch.webp",
"width": 80,
"height": 80,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/80x80/cd03d1c011/sue-darte.webp",
"width": 80,
"height": 80,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/80x80/82f8c1c75b/jacob-p.webp",
"width": 80,
"height": 80,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://www.photoroom.com//_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fphone-static.ff9096fe.webp&w=3840&q=75",
"width": 440,
"height": 802,
"encodingFormat": "image/png"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://www.photoroom.com//_next/image?url=%2F_next%2Fstatic%2Fmedia%2Flaptop.ffe2a6e2.webp&w=3840&q=75",
"width": 496,
"height": 640,
"encodingFormat": "image/png"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/1200x800/4e54b928ef/remove_background.webp",
"width": 1200,
"height": 800,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/1200x800/85085e1807/ai_backgrounds_after_.webp",
"width": 1200,
"height": 800,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/1200x800/1fb223a626/blur_background_after_.webp",
"width": 1200,
"height": 800,
"encodingFormat": "image/webp"
},
{
"@context": "https://schema.org",
"@type": "ImageObject",
"url": "https://storyblok-cdn.photoroom.com/f/191576/1200x800/88361a19c7/retouch_before_.webp",
"width": 1200,
"height": 800,
"encodingFormat": "image/webp"
}
]
生成的结果,直接可以放在 validator.schema.org/ 上进行验证;
✿✿ヽ(°▽°)ノ✿ 全部验证通过。😊😊😊