# Hybrid Global-Local Representation with Augmented Spatial Guidance for Zero-Shot Referring Image Segmentation
# Create and activate the conda env
conda create -n hybridgl python=3.10
conda activate hybridgl
# Install Pytorch 2.0.1+cu117 version
conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.7 -c pytorch -c nvidia
# Install spacy for language processing
conda install -c conda-forge spacy=3.7.6 einops=0.8.0
pip install pydantic==2.9.1
python -m spacy download en_core_web_lg
# Install required packages
pip install opencv-python==4.10.0.84 matplotlib==3.9.2 markupsafe==2.1.5 h5py scikit-image==0.24.0 pycocotools==2.0.8
# Install GEM, make sure the open_clip version is 2.24.0
pip install gem_torch
pip install open_clip_torch==2.24.0
cd third_party  # NOTE(review): original said "third_parth" — confirm the actual directory name in the repo
cd modified_CLIP
pip install -e .
cd ..
cd segment-anything
pip install -e .
cd ../..
mkdir checkpoints
cd checkpoints
wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
We follow the dataset setup in Zero-shot-RIS.
In the "./refer/data/images/mscoco/images" directory, run:
wget http://images.cocodataset.org/zips/train2014.zip
unzip train2014.zip
In the "./refer/data" directory, run:
# RefCOCO
wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip
unzip refcoco.zip
# RefCOCO+
wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip
unzip refcoco+.zip
# RefCOCOg
wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip
unzip refcocog.zip
CUDA_VISIBLE_DEVICES=0 /path2env/bin/python Hybridgl_main.py --dataset refcoco(+/g) --split testA/testB/test/val --fusion_mode G2L/L2G/G2L&L2G
CUDA_VISIBLE_DEVICES=0 /path2env/bin/python Hybridgl_main.py --dataset refcocog --split val --fusion_mode G2L
...
Our code is based on the following open-source projects: CLIP, Zero-shot-RIS, and GEM. We sincerely thank the developers of these resources!
BibTeX:
@InProceedings{Liu_2025_CVPR,
author = {Liu, Ting and Li, Siyuan},
title = {Hybrid Global-Local Representation with Augmented Spatial Guidance for Zero-Shot Referring Image Segmentation},
booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR)},
month = {June},
year = {2025},
pages = {29634-29643}
}