介绍:OCR识别应用
步骤一:引入依赖
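<!-- 在 pom.xml 的 <properties> 中声明版本号 -->
<properties>
    <dashscope.sdk.version>2.18.4</dashscope.sdk.version>
</properties>
<!-- 在 pom.xml 的 <dependencies> 中引入依赖 -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>dashscope-sdk-java</artifactId>
    <version>${dashscope.sdk.version}</version>
</dependency>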
步骤二:配置
spring:
  application:
    name: RuoYi-Vue-Plus
  ai:
    dashscope:
      api-key: 个人key
ocr:
  dashscope:
    model: qwen-vl-ocr-latest
步骤三:字段提取接口
[1]ImageController
/**
 * Handles {@code POST /textExtractImage}: runs key-field extraction on the image
 * described by the request body.
 *
 * <p>The request is forwarded unchanged to {@code imageService.textExtractImage},
 * and the integer it returns is converted into the response via {@code toAjax}.
 *
 * @param imageRequestBo request payload deserialized from the JSON body
 * @return the response produced by {@code toAjax} from the service's integer result
 * @throws Exception any exception raised by the service call is propagated as-is
 */
@PostMapping("/textExtractImage")
public R<Void> textExtractImage(@RequestBody ImageRequestBo imageRequestBo) throws Exception {
    int serviceResult = imageService.textExtractImage(imageRequestBo);
    return toAjax(serviceResult);
}
[2]ImageService
/**
 * Service contract for image-based field extraction.
 */
public interface ImageService {
    /**
     * Extracts fields from the image described by the request.
     *
     * @param imageRequestBo request object describing the image to process
     * @return an integer status value; its meaning is defined by the implementation
     * @throws Exception if the implementation fails while performing the extraction
     */
    int textExtractImage(ImageRequestBo imageRequestBo) throws Exception;
}
[3]ImageServiceImpl
/**
 * {@link ImageService} implementation that sends an image to a DashScope multimodal
 * OCR model using the built-in key-information-extraction task and logs the reply.
 */
@Service
@Slf4j
public class ImageServiceImpl implements ImageService {

    /**
     * Value sent as {@code max_pixels}: the maximum pixel threshold of the input image.
     * Larger images are scaled down proportionally until they fall below it.
     */
    private static final String MAX_PIXELS = "6422528";

    /**
     * Value sent as {@code min_pixels}: the minimum pixel threshold of the input image.
     * Smaller images are scaled up proportionally until they exceed it.
     */
    private static final String MIN_PIXELS = "3136";

    @Autowired
    private ModelPropertyConfig modelPropertyConfig;

    /**
     * Extracts the configured invoice fields from the image referenced by the request.
     *
     * <p>The model's reply is only written to the log; it is not returned to the caller.
     *
     * @param imageRequestBo request carrying the image URL and the prompt text
     * @return always {@code 1} once the model call has completed
     * @throws IllegalArgumentException if the request is {@code null} or its image URL
     *                                  is {@code null} or blank
     * @throws Exception                if the underlying model call fails
     */
    @Override
    public int textExtractImage(ImageRequestBo imageRequestBo) throws Exception {
        // Fail fast on unusable input instead of hitting an NPE below or spending a
        // remote call on a request that has no image to process.
        if (imageRequestBo == null) {
            throw new IllegalArgumentException("imageRequestBo must not be null");
        }
        String imageUrl = imageRequestBo.getImageUrl();
        if (imageUrl == null || imageUrl.trim().isEmpty()) {
            throw new IllegalArgumentException("imageUrl must not be null or blank");
        }

        MultiModalConversationParam param = MultiModalConversationParam.builder()
            .apiKey(modelPropertyConfig.getCommonApiKey())
            .model(modelPropertyConfig.getOcrModelName())
            .message(buildUserMessage(imageUrl, imageRequestBo.getPromptText()))
            .ocrOptions(buildOcrOptions())
            .build();

        MultiModalConversation conv = new MultiModalConversation();
        MultiModalConversationResult result = conv.call(param);
        log.info("result: {}", JSONUtil.toJsonStr(result));
        return 1;
    }

    /**
     * Builds the user message: the image part (with scaling and rotation settings)
     * followed by the text part.
     */
    private static MultiModalMessage buildUserMessage(String imageUrl, String promptText) {
        Map<String, Object> imagePart = new HashMap<>();
        imagePart.put("image", imageUrl);
        imagePart.put("max_pixels", MAX_PIXELS);
        imagePart.put("min_pixels", MIN_PIXELS);
        // Enable automatic image rotation correction.
        imagePart.put("enable_rotate", true);

        // NOTE(review): promptText is forwarded as-is, including null — confirm whether
        // the built-in OCR task needs a text part at all.
        return MultiModalMessage.builder()
            .role(Role.USER.getValue())
            .content(Arrays.asList(
                imagePart,
                Collections.singletonMap("text", promptText)))
            .build();
    }

    /** Builds the result schema: one empty-valued entry per field to be extracted. */
    private static JsonObject buildResultSchema() {
        JsonObject resultSchema = new JsonObject();
        resultSchema.addProperty("销售方名称", "");
        resultSchema.addProperty("购买方名称", "");
        resultSchema.addProperty("不含税价", "");
        resultSchema.addProperty("组织机构代码", "");
        resultSchema.addProperty("发票代码", "");
        return resultSchema;
    }

    /** Configures the built-in key-information-extraction OCR task with the result schema. */
    private static OcrOptions buildOcrOptions() {
        return OcrOptions.builder()
            .task(OcrOptions.Task.KEY_INFORMATION_EXTRACTION)
            .taskConfig(OcrOptions.TaskConfig.builder()
                .resultSchema(buildResultSchema())
                .build())
            .build();
    }
}
[4]ModelPropertyConfig
/**
 * Spring component exposing model-related settings read from application properties.
 */
@Data
@Component
public class ModelPropertyConfig {
    /**
     * Common API key, injected from the {@code spring.ai.dashscope.api-key} property.
     */
    @Value("${spring.ai.dashscope.api-key}")
    private String commonApiKey;
    /**
     * OCR model name, injected from the {@code ocr.dashscope.model} property.
     */
    @Value("${ocr.dashscope.model}")
    private String ocrModelName;
}
[5]ImageRequestBo
/**
 * Request object carrying the inputs for an image field-extraction call.
 */
@Data
public class ImageRequestBo {
    /**
     * URL of the image.
     */
    private String imageUrl;
    /**
     * Prompt text.
     */
    private String promptText;
}
步骤四:接口测试
发票地址:https://prism-test-data.oss-cn-hangzhou.aliyuncs.com/image/car_invoice/car-invoice-img00040.jpg
日志打印
result: {"requestId":"d2a7cea2-31c2-9f71-8742-295742f7c3fe","usage":{"inputTokens":1173,"outputTokens":103,"totalTokens":1276,"imageTokens":1001,"inputTokensDetails":{"textTokens":172,"imageTokens":1001},"outputTokensDetails":{"textTokens":103}},"output":{"choices":[{"finishReason":"stop","message":{"role":"assistant","content":[{"text":"```json\n{\n \"不含税价\": \"230769.23\",\n \"发票代码\": \"142011726001\",\n \"组织机构代码\": \"\",\n \"购买方名称\": \"蔡应时\",\n \"销售方名称\": \"湖北中基汽车销售服务有限公司\",\n \"销售方税号\": \"91420000670389609X\"\n}\n```","ocr_result":{"kv_result":{"销售方名称":"湖北中基汽车销售服务有限公司","不含税价":"230769.23","购买方名称":"蔡应时","发票代码":"142011726001","组织机构代码":""}}}]}}]}}
经过验证,提取出的各字段及其取值均与发票内容一致!
本人正在打造技术交流群,欢迎志同道合的朋友一起探讨,一起努力,通过自己的努力,在技术岗位这条道路上走得更远。QQ群号:925317809 备注:技术交流 即可通过!
加入技术群可以获取资料,含AI资料、Spring AI中文文档等,等你加入~