It can generate highly accurate captions for a wide range of genres, such as real photos, anime, and digital art. Above all, because it handles NSFW images without censorship, it remains popular as a "caption-specialized model" even now that more capable general-purpose MLLMs have appeared.
Because it is built on a LLaVA-based multimodal model, it is not lightweight compared to taggers, but it is easy to incorporate into a local image generation pipeline as a ComfyUI node.
Generates a caption from the input image.
{
"id": "ded66799-2c80-4180-a0ff-fa576917dc55",
"revision": 0,
"last_node_id": 8,
"last_link_id": 5,
"nodes": [
{
"id": 3,
"type": "JC_ExtraOptions",
"pos": [
-23.901104900351015,
618.0458046497705
],
"size": [
400,
736
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "extra_options",
"type": "JOYCAPTION_EXTRA_OPTIONS",
"links": [
1
]
}
],
"properties": {
"cnr_id": "comfyui-joycaption",
"ver": "2.0.2",
"Node name for S&R": "JC_ExtraOptions"
},
"widgets_values": [
false,
true,
true,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
false,
""
],
"color": "#432",
"bgcolor": "#653"
},
{
"id": 5,
"type": "PreviewAny",
"pos": [
682.6804194740781,
537.0944700512295
],
"size": [
335.4273478587527,
226.0551465063861
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "source",
"type": "*",
"link": 3
}
],
"outputs": [],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.76",
"Node name for S&R": "PreviewAny"
},
"widgets_values": [
null,
null,
null
]
},
{
"id": 4,
"type": "LoadImage",
"pos": [
-279.2074297165498,
537.0944700512295
],
"size": [
225.89825994318176,
407.63636363636374
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
2
]
},
{
"name": "MASK",
"type": "MASK",
"links": null
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.76",
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"download (1).jpg",
"image"
]
},
{
"id": 1,
"type": "JC",
"pos": [
405.50695997266604,
537.0944700512295
],
"size": [
246.59217840204735,
174
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "image",
"type": "IMAGE",
"link": 2
},
{
"name": "extra_options",
"shape": 7,
"type": "JOYCAPTION_EXTRA_OPTIONS",
"link": 1
}
],
"outputs": [
{
"name": "STRING",
"type": "STRING",
"links": [
3
]
}
],
"properties": {
"cnr_id": "comfyui-joycaption",
"ver": "2.0.2",
"Node name for S&R": "JC"
},
"widgets_values": [
"joycaption-beta-one",
"Balanced (8-bit)",
"Straightforward",
"medium",
"Keep in Memory"
],
"color": "#232",
"bgcolor": "#353"
}
],
"links": [
[
1,
3,
0,
1,
1,
"JOYCAPTION_EXTRA_OPTIONS"
],
[
2,
4,
0,
1,
0,
"IMAGE"
],
[
3,
1,
0,
5,
0,
"*"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 0.9090909090909091,
"offset": [
462.80742971654985,
-391.99447005122937
]
},
"frontendVersion": "1.34.6",
"VHS_latentpreview": false,
"VHS_latentpreviewrate": 0,
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"version": 0.4
}