This model is a mirror/redistribution of the original RS-M-CLIP model.
RS-M-CLIP (Remote Sensing Multilingual CLIP) is a multilingual vision-language foundation model for remote sensing. It follows the CLIP architecture, pairing a ViT-B-32 vision encoder with an XLM-RoBERTa text encoder, and was trained on remote sensing image-caption pairs in multiple languages. The model supports English, Portuguese, Spanish, French, German, Dutch, Italian, Chinese, Korean, and Russian.
transformers
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
# Load model and processor
model = CLIPModel.from_pretrained("BiliSakura/RS-M-CLIP-ViT-B-32")
processor = CLIPProcessor.from_pretrained("BiliSakura/RS-M-CLIP-ViT-B-32")
# Load and process image
image = Image.open("path/to/your/image.jpg")
inputs = processor(
    text=["a photo of a building", "a photo of vegetation", "a photo of water"],
    images=image,
    return_tensors="pt",
    padding=True
)
# Get image-text similarity scores
with torch.inference_mode():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
print(f"Similarity scores: {probs}")
Zero-shot image classification (multilingual):
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
model = CLIPModel.from_pretrained("BiliSakura/RS-M-CLIP-ViT-B-32")
processor = CLIPProcessor.from_pretrained("BiliSakura/RS-M-CLIP-ViT-B-32")
# Define candidate labels in multiple languages
candidate_labels = [
    "a satellite image of urban area",        # English
    "uma imagem de satélite de área urbana",  # Portuguese
    "una imagen satelital de área urbana",    # Spanish
    "une image satellite de zone urbaine",    # French
]
image = Image.open("path/to/your/image.jpg")
inputs = processor(
    text=candidate_labels,
    images=image,
    return_tensors="pt",
    padding=True
)
with torch.inference_mode():
    outputs = model(**inputs)
    probs = outputs.logits_per_image.softmax(dim=1)
# Get the predicted label
predicted_idx = probs.argmax().item()
print(f"Predicted label: {candidate_labels[predicted_idx]}")
print(f"Confidence: {probs[0][predicted_idx]:.4f}")
Extracting individual features:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
model = CLIPModel.from_pretrained("BiliSakura/RS-M-CLIP-ViT-B-32")
processor = CLIPProcessor.from_pretrained("BiliSakura/RS-M-CLIP-ViT-B-32")
# Get image features only
image = Image.open("path/to/your/image.jpg")
image_inputs = processor(images=image, return_tensors="pt")
with torch.inference_mode():
    image_features = model.get_image_features(**image_inputs)
# Get text features only
text_inputs = processor(
    text=["a satellite image of urban area"],
    return_tensors="pt",
    padding=True,
    truncation=True
)
with torch.inference_mode():
    text_features = model.get_text_features(**text_inputs)
print(f"Image features shape: {image_features.shape}")
print(f"Text features shape: {text_features.shape}")
diffusers
This model's text encoder can be used with Stable Diffusion and other diffusion models:
from diffusers import StableDiffusionPipeline
from transformers import CLIPTextModel, CLIPTokenizer
import torch
# Load the text encoder and tokenizer
text_encoder = CLIPTextModel.from_pretrained(
    "BiliSakura/RS-M-CLIP-ViT-B-32",
    subfolder="diffusers/text_encoder",
    torch_dtype=torch.float16
)
tokenizer = CLIPTokenizer.from_pretrained(
    "BiliSakura/RS-M-CLIP-ViT-B-32"
)
# Encode text prompt (supports multiple languages)
prompt = "a satellite image of a city with buildings and roads"
text_inputs = tokenizer(
    prompt,
    padding="max_length",
    max_length=77,
    truncation=True,
    return_tensors="pt"
)
with torch.inference_mode():
    text_outputs = text_encoder(text_inputs.input_ids)
    text_embeddings = text_outputs.last_hidden_state
print(f"Text embeddings shape: {text_embeddings.shape}")
Using with Stable Diffusion:
from diffusers import StableDiffusionPipeline
import torch
# Load pipeline with custom text encoder
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    torch_dtype=torch.float16
)
pipe = pipe.to("cuda")
# Generate image (supports multilingual prompts)
prompt = "a high-resolution satellite image of urban area"
image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
image.save("generated_image.png")
If you use this model in your research, please cite the original work:
@article{silva2024multilingual,
  title   = {Multilingual Vision-Language Pre-training for the Remote Sensing Domain},
  author  = {Silva, Jo{\~a}o Daniel and Magalh{\~a}es, Jo{\~a}o and Tuia, Devis and Martins, Bruno},
  year    = {2024},
  journal = {arXiv:2410.23370}
}