Paste P86468: Docker run
Authored by gkyziridis on Dec 9 2025, 10:47 AM.

$ docker run --rm -it --network=host torch_rocm3
Python 3.11.2 (main, Apr 28 2025, 14:11:48) [GCC 12.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> from transformers import AutoTokenizer, AutoModel
>>> import torch.nn as nn
>>> try:
...     import torch._dynamo
...     torch._dynamo.config.suppress_errors = True
... except Exception:
...     pass
...
>>> DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>> MODEL_NAME = "answerdotai/ModernBERT-base"
>>> print("Device =", DEVICE)
Device = cpu
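
Note: the device resolves to cpu even though this is a ROCm image. A minimal
diagnostic sketch (not part of the original session), assuming a ROCm build of
PyTorch, where the GPU surfaces through the CUDA API and torch.version.hip is
set:

import torch

# On a ROCm build, torch.version.hip is a version string;
# None means a CPU- or CUDA-only build.
print("HIP runtime   :", torch.version.hip)
print("GPU available :", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device 0      :", torch.cuda.get_device_name(0))

If the build is a ROCm one but is_available() is still False, the container
usually lacks the GPU device nodes: ROCm containers are typically started with
--device=/dev/kfd --device=/dev/dri (and often --group-add video), which the
docker run command above omits.
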
>>> tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
>>> encoder = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
>>> head = nn.Linear(encoder.config.hidden_size, 2).to(DEVICE)
>>> criterion = nn.CrossEntropyLoss()
>>> optimizer = torch.optim.AdamW(
...     list(encoder.parameters()) + list(head.parameters()),
...     lr=1e-4
... )
>>> texts = ["hello world", "modernbert rocm test"]
>>> batch = tokenizer(texts, padding=True, return_tensors="pt").to(DEVICE)
>>> labels = torch.tensor([0, 1], device=DEVICE)
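
Note: batch here is a BatchEncoding (a dict-like object) holding input_ids and
attention_mask, padded to the longest text in the list. An illustrative peek,
not part of the original session:

for name, tensor in batch.items():
    # e.g. input_ids (2, N) / attention_mask (2, N); the exact sequence
    # length N depends on the tokenizer's BPE splits.
    print(name, tuple(tensor.shape))
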
>>> encoder.train()
ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
    (1-21): 21 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
  )
  (final_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
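
(The wall of output above is just the REPL echoing a return value:
Module.train() returns the module itself, so the full ModernBertModel repr is
printed. encoder.eval() below triggers the same echo.)
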
>>> head.train()
Linear(in_features=768, out_features=2, bias=True)
>>> outputs = encoder(**batch)
>>> pooled = outputs.last_hidden_state[:, 0, :] # CLS token
>>> logits = head(pooled)
>>> loss = criterion(logits, labels)
>>> print("Loss =", loss.item())
Loss = 0.7854277491569519
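
Sanity check, not from the session: with two classes and a freshly initialized
head, the starting cross-entropy should sit near -ln(1/2):

import math
# Expected initial loss for 2 classes under near-uniform logits:
print(math.log(2))  # 0.6931..., so 0.7854 is in the expected range
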
>>> loss.backward()
>>> optimizer.step()
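
The session performs a single update. Repeated, the gradients must be cleared
each step, since backward() accumulates. A hypothetical loop built from the
same objects and toy batch:

for step in range(3):
    optimizer.zero_grad()               # required once this runs in a loop
    outputs = encoder(**batch)
    pooled = outputs.last_hidden_state[:, 0, :]   # CLS pooling, as above
    loss = criterion(head(pooled), labels)
    loss.backward()
    optimizer.step()
    print(f"step {step}: loss = {loss.item():.4f}")
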
>>> # Inference
>>> encoder.eval()
ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
    (1-21): 21 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
  )
  (final_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
>>> head.eval()
Linear(in_features=768, out_features=2, bias=True)
>>> with torch.no_grad():
...     outputs = encoder(**batch)
...     pooled = outputs.last_hidden_state[:, 0, :]
...     logits = head(pooled)
...
>>> all_zero = torch.allclose(logits, torch.zeros_like(logits))
>>> has_nan = torch.isnan(logits).any()
>>> print("\n===== RESULT =====")
>>> print("All-zero logits? :", bool(all_zero))
>>> print("Contains NaNs? :", bool(has_nan))
>>> print("Logits:", logits.cpu())
===== RESULT =====
All-zero logits? : False
Contains NaNs? : False
Logits: tensor([[-0.8974, 0.5434],
[ 0.5851, -0.1151]])
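
A natural follow-up, not in the session: convert the logits to probabilities
and predicted classes.

import torch.nn.functional as F

probs = F.softmax(logits, dim=-1)   # rows sum to 1
preds = probs.argmax(dim=-1)
print("Probs:", probs)
print("Preds:", preds)              # the logits above give tensor([1, 0])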