# Demonstration: Uniform temporal sampling
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Simulate a 30-second video at 30fps
original_fps = 30
duration_sec = 30
total_frames = original_fps * duration_sec  # 900 frames

# Sample at different rates
sample_rates = [1, 2, 4, 8]  # fps
colors = ['#2ecc71', '#3498db', '#e74c3c', '#f39c12']

fig, axes = plt.subplots(len(sample_rates), 1, figsize=(14, 6), sharex=True)
# plt.subplots returns a bare Axes (not an array) when there is a single row;
# normalise so the zip() and axes[-1] below also work with one sample rate.
axes = np.atleast_1d(axes)
fig.suptitle('Uniform Temporal Sampling at Different FPS Rates', fontsize=14, fontweight='bold')

# Timestamps of every original frame: identical for all rows, so computed once
# instead of once per loop iteration.
all_times = np.linspace(0, duration_sec, total_frames, endpoint=False)

for ax, fps, color in zip(axes, sample_rates, colors):
    n_sampled = fps * duration_sec
    sampled_times = np.linspace(0, duration_sec, n_sampled, endpoint=False)

    # Show all original frames as light ticks
    ax.eventplot([all_times], colors=['#ddd'], lineoffsets=0, linelengths=0.3)

    # Show sampled frames
    ax.eventplot([sampled_times], colors=[color], lineoffsets=0, linelengths=0.8, linewidths=1.5)
    ax.set_ylabel(f'{fps} fps\n({n_sampled} frames)', fontsize=9)
    ax.set_ylim(-0.6, 0.6)
    ax.set_yticks([])
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

axes[-1].set_xlabel('Time (seconds)', fontsize=11)
plt.tight_layout()
plt.show()

print(f"Original video: {total_frames} frames at {original_fps} fps")
for fps in sample_rates:
    n = fps * duration_sec
    # `:g` keeps fractional ratios (7.5×, 3.75×); `.0f` rounded them to 8× / 4×.
    print(f"  Sampled at {fps} fps: {n} frames ({total_frames/n:g}× reduction)")

# Demonstration: Token budget allocation across frames

def compute_frame_tokens(height, width, patch_size=14, merge_factor=2):
    """Return the number of visual tokens produced for one frame.

    Each axis is first cut into ``patch_size``-pixel patches (floor division),
    then every ``merge_factor`` x ``merge_factor`` group of patches is collapsed
    into a single token (floor division again). The result is the product of
    the two per-axis token counts.
    """
    tokens_along_axis = [
        (extent // patch_size) // merge_factor
        for extent in (height, width)
    ]
    return tokens_along_axis[0] * tokens_along_axis[1]

# Scenario: 60-second video, different sampling strategies.
# Each bar is the total visual-token count for one (fps, resolution) choice,
# drawn against a fixed token budget line.
duration = 60  # seconds
token_budget = 16384

scenarios = {
    'High FPS, Low Res\n(4 fps, 224×224)': {'fps': 4, 'h': 224, 'w': 224},
    'Medium FPS, Medium Res\n(2 fps, 448×448)': {'fps': 2, 'h': 448, 'w': 448},
    'Low FPS, High Res\n(0.5 fps, 896×896)': {'fps': 0.5, 'h': 896, 'w': 896},
}

fig, ax = plt.subplots(figsize=(10, 5))
x_pos = np.arange(len(scenarios))
bar_colors = ['#3498db', '#2ecc71', '#e74c3c']

for i, cfg in enumerate(scenarios.values()):
    n_frames = int(cfg['fps'] * duration)
    # 3D conv merges pairs of frames
    effective_frames = n_frames // 2
    tokens_per_frame = compute_frame_tokens(cfg['h'], cfg['w'])
    total_tokens = effective_frames * tokens_per_frame

    ax.bar(i, total_tokens, color=bar_colors[i], alpha=0.8, edgecolor='white', linewidth=2)
    # Show the pair-merge step in the label so the displayed arithmetic
    # (pairs × tokens = total) matches the value that is actually plotted.
    ax.text(i, total_tokens + 300,
            f"{n_frames} frames → {effective_frames} pairs\n× {tokens_per_frame} tok/pair\n= {total_tokens:,} total",
            ha='center', va='bottom', fontsize=9, fontweight='bold')

ax.axhline(y=token_budget, color='red', linestyle='--', linewidth=2, label=f'Token budget ({token_budget:,})')
ax.set_xticks(x_pos)
ax.set_xticklabels(list(scenarios), fontsize=9)
ax.set_ylabel('Total Visual Tokens', fontsize=11)
ax.set_title('Token Budget Trade-offs: FPS vs Resolution (60s video)', fontsize=13, fontweight='bold')
ax.legend(fontsize=10)
ax.set_ylim(0, max(20000, token_budget * 1.5))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.show()

# ============================================================================
# SOURCE CODE WALKTHROUGH: Frame Sampling in qwen-vl-utils
# From: qwen-vl-utils/src/qwen_vl_utils/vision_process.py
# https://github.com/QwenLM/Qwen3-VL/tree/main/qwen-vl-utils
# ============================================================================

# --- Global constants that control frame sampling ---
# These are the defaults set at the top of vision_process.py:

import math

FPS = 2.0               # Default sampling rate: 2 frames per second
FRAME_FACTOR = 2         # Frames must be a multiple of 2 (for 3D conv temporal depth)
FPS_MIN_FRAMES = 4       # Minimum frames to sample from any video
FPS_MAX_FRAMES = 768     # Maximum frames to sample
SPATIAL_MERGE_SIZE = 2   # The 2×2 spatial merge factor


# --- Rounding helpers used by smart_nframes() ---
# They are defined here so the walkthrough function below can actually be called.

def round_by_factor(number, factor):
    """Return the multiple of `factor` closest to `number` (uses built-in round())."""
    return round(number / factor) * factor


def ceil_by_factor(number, factor):
    """Return the smallest multiple of `factor` that is >= `number`."""
    return math.ceil(number / factor) * factor


def floor_by_factor(number, factor):
    """Return the largest multiple of `factor` that is <= `number`."""
    return math.floor(number / factor) * factor


# --- Step 1: smart_nframes() — decides HOW MANY frames to sample ---
# This is the core function that computes the target number of frames.

def smart_nframes(ele, total_frames, video_fps):
    """Calculate the number of frames for video used for model inputs.

    Args:
        ele: dict with optional 'fps', 'nframes', 'min_frames', 'max_frames' keys
        total_frames: original total frames in the video
        video_fps: original fps of the video
    Returns:
        int: number of frames to extract (always a multiple of FRAME_FACTOR=2)
    Raises:
        AssertionError: if both 'fps' and 'nframes' are present in `ele`.
    """
    assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"

    if "nframes" in ele:
        # User explicitly set number of frames
        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
    else:
        # ⭐ DEFAULT PATH: Sample based on FPS ratio
        fps = ele.get("fps", FPS)                              # default: 2.0 fps
        min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)   # default: 4
        max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)

        #  ⭐ KEY CALCULATION: desired frames = total_frames / original_fps × target_fps
        #  e.g. 60s video at 30fps: 1800 frames / 30 × 2.0 = 120 frames
        nframes = total_frames / video_fps * fps

        # Clamp into [min_frames, max_frames], and never above the frames that exist
        nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
        nframes = floor_by_factor(nframes, FRAME_FACTOR)       # round down to multiple of 2

    return int(nframes)


# --- Step 2: The video reader selects WHICH frames via linspace ---
# All three backends (decord, torchvision, torchcodec) use the same pattern:

def _read_video_decord_simplified(ele):
    """Simplified version of _read_video_decord showing the frame selection logic.

    Args:
        ele: dict; ``ele["video"]`` is handed to ``decord.VideoReader``. The same
            dict is also forwarded to ``calculate_video_frame_range`` and
            ``smart_nframes``.
    Returns:
        tuple ``(video, video_metadata, sample_fps)``: the batch returned by
        ``vr.get_batch``, a metadata dict (original fps, selected frame indices,
        frame count, backend name) and the effective sampling rate.

    NOTE(review): ``calculate_video_frame_range`` is not defined in the visible
    part of this file; it must be provided elsewhere before this is called.
    """
    # Imported locally so the walkthrough file still loads when these optional
    # packages are not installed.
    import decord
    import torch

    vr = decord.VideoReader(ele["video"])
    total_frames, video_fps = len(vr), vr.get_avg_fps()

    # Get valid frame range (handles video_start/video_end clipping)
    start_frame, end_frame, total_frames = calculate_video_frame_range(ele, total_frames, video_fps)

    # Decide how many frames
    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)

    # ⭐ KEY: Uniformly space `nframes` indices across the valid range
    # This is where the actual frame SELECTION happens
    idx = torch.linspace(start_frame, end_frame, nframes).round().long().tolist()

    # Read only the selected frames
    video = vr.get_batch(idx)

    # ⭐ Calculate the effective sample FPS (used later for temporal IDs)
    # max(..., 1e-6) guards against division by zero for an empty range.
    sample_fps = nframes / max(total_frames, 1e-6) * video_fps

    # Return video tensor + metadata (including which frames were selected)
    video_metadata = dict(
        fps=video_fps,
        frames_indices=idx,              # ← the actual frame indices selected
        total_num_frames=total_frames,
        video_backend="decord",
    )
    return video, video_metadata, sample_fps

print("✅ Source code walkthrough loaded — see comments above for how frame sampling works")

# ============================================================================
# SOURCE CODE WALKTHROUGH: Token Budget & Resolution in fetch_video()
# From: qwen-vl-utils/src/qwen_vl_utils/vision_process.py
# ============================================================================

# After frames are selected, fetch_video() enforces the token budget by
# adjusting the spatial resolution of each frame.

def fetch_video_simplified(ele, image_patch_size=14):
    """Simplified fetch_video showing the token budget enforcement logic.

    Illustrative pseudo-code: it derives the per-frame pixel cap (`max_pixels`)
    from the number of sampled frames and a total pixel budget, then returns a
    placeholder string instead of real video data.

    Args:
        ele: dict; the optional keys "min_pixels" and "total_pixels" override
            the defaults computed below.
        image_patch_size: patch edge in pixels; multiplied by SPATIAL_MERGE_SIZE
            to get the pixel edge covered by one merged token.
    Returns:
        The literal string "video tensor + metadata" (placeholder).

    NOTE(review): the backend call in step 1 is a string placeholder, so the
    3-way unpacking raises ValueError if this function is actually executed.
    """

    image_factor = image_patch_size * SPATIAL_MERGE_SIZE      # 14 * 2 = 28 (or 16 * 2 = 32 for Qwen3-VL)
    VIDEO_FRAME_MIN_PIXELS = 128 * image_factor * image_factor  # Min tokens × area per token
    VIDEO_FRAME_MAX_PIXELS = 768 * image_factor * image_factor  # Max tokens × area per token

    # 1. Read video frames (via decord/torchvision/torchcodec)
    # NOTE(review): placeholder — a real reader call is needed here.
    video, video_metadata, sample_fps = "_read_video_backend(ele)"

    nframes = video.shape[0]  # number of sampled frames
    # NOTE(review): assumes axes 2 and 3 are height and width — confirm the
    # layout against whichever reader replaces the placeholder above.
    height, width = video.shape[2], video.shape[3]

    # 2. Calculate resolution constraints based on token budget
    min_pixels = ele.get("min_pixels", VIDEO_FRAME_MIN_PIXELS)

    # ⭐ MODEL_SEQ_LEN controls the overall context budget (default 128K)
    MODEL_SEQ_LEN = 128000
    # Default budget: 90% of MODEL_SEQ_LEN tokens, expressed in pixels
    total_pixels = ele.get("total_pixels", MODEL_SEQ_LEN * image_factor * image_factor * 0.9)

    # ⭐ KEY: max_pixels per frame shrinks as nframes increases
    # This is the FPS vs resolution trade-off in action!
    # Local name shadows any module-level FRAME_FACTOR for the rest of this function.
    FRAME_FACTOR = 2
    # Cap at VIDEO_FRAME_MAX_PIXELS, but never go below 1.05 × min_pixels
    max_pixels = max(
        min(VIDEO_FRAME_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR),
        int(min_pixels * 1.05)
    )

    # 3. Resize frames to fit within the computed pixel budget
    # resized_height, resized_width = smart_resize(height, width, min_pixels=min_pixels, max_pixels=max_pixels)
    # video = resize(video, [resized_height, resized_width])

    return "video tensor + metadata"


# Let's see the actual numbers for different scenarios.
# One table per patch size; both tables share the same sequence-length budget
# and the same list of frame counts.
MODEL_SEQ_LEN = 128000
budget_tables = [
    ("Token budget examples (image_patch_size=14, merge=2, factor=28):", 14),  # Qwen2.5-VL
    ("Token budget examples (image_patch_size=16, merge=2, factor=32) — Qwen3-VL:", 16),  # Qwen3-VL
]

for table_no, (table_title, patch_px) in enumerate(budget_tables):
    if table_no:
        print()  # blank line between consecutive tables
    print(table_title)
    print("=" * 70)

    image_factor = patch_px * 2  # pixel edge covered by one merged token
    area_per_token = image_factor * image_factor
    total_pixels = MODEL_SEQ_LEN * area_per_token * 0.9
    VIDEO_FRAME_MAX_PIXELS = 768 * area_per_token
    VIDEO_FRAME_MIN_PIXELS = 128 * area_per_token

    for nframes in [10, 30, 60, 120, 240]:
        # Per-frame share of the budget, capped above and floored below
        budget_share = total_pixels / nframes * 2
        capped_share = min(VIDEO_FRAME_MAX_PIXELS, budget_share)
        max_pixels_per_frame = max(capped_share, int(VIDEO_FRAME_MIN_PIXELS * 1.05))
        # Approximate resolution (square)
        approx_side = int(max_pixels_per_frame ** 0.5)
        # Tokens per frame = pixels / (patch_size * merge)^2
        tokens_per_frame = max_pixels_per_frame // area_per_token

        row = (
            f"  {nframes:>3} frames → max {max_pixels_per_frame:>10,} px/frame "
            f"(~{approx_side}×{approx_side}) → ~{tokens_per_frame} tokens/frame "
            f"→ {nframes * tokens_per_frame // 2:,} total tokens (after 3D conv)"
        )
        print(row)

# Visualization: Frame → Patches → 3D Tubes → Merged Tokens

def _draw_checkerboard(target_ax, cells, cmap, edge_width, caption):
    """Fill the unit square of `target_ax` with a cells×cells two-tone grid.

    Cells are drawn row by row; the shade alternates with the parity of
    (row + col). `caption` becomes the x-axis label and both tick sets are
    removed afterwards.
    """
    cell = 1 / cells
    for row, col in np.ndindex(cells, cells):
        shade = cmap(0.3 + 0.5 * ((row + col) % 2))
        target_ax.add_patch(
            mpatches.Rectangle((col / cells, row / cells), cell, cell,
                               facecolor=shade, edgecolor='white', linewidth=edge_width)
        )
    target_ax.set_xlim(0, 1)
    target_ax.set_ylim(0, 1)
    target_ax.set_xlabel(caption, fontsize=9)
    target_ax.set_xticks([])
    target_ax.set_yticks([])


fig, axes = plt.subplots(1, 4, figsize=(16, 4))

# Step 1: Original frames (pair): two stacked rounded boxes
ax = axes[0]
ax.set_title('Step 1: Frame Pair\n(sampled from video)', fontsize=10, fontweight='bold')
frame_boxes = [(0, 'Frame t', '#3498db'), (0.55, 'Frame t+1', '#2ecc71')]
for y0, caption, tint in frame_boxes:
    ax.add_patch(
        mpatches.FancyBboxPatch((0.05, y0), 0.9, 0.4,
                                boxstyle='round,pad=0.02',
                                facecolor=tint, alpha=0.3, edgecolor=tint, linewidth=2)
    )
    ax.text(0.5, y0 + 0.2, caption, ha='center', va='center', fontsize=10, fontweight='bold')
ax.set_xlim(0, 1)
ax.set_ylim(-0.05, 1.05)
ax.axis('off')

grid_size = 8  # simplified
merged_size = grid_size // 2

# Step 2: Patch extraction
ax = axes[1]
ax.set_title('Step 2: Patch Extraction\n(patch_size=14)', fontsize=10, fontweight='bold')
_draw_checkerboard(ax, grid_size, plt.cm.Blues, 0.5,
                   f'{grid_size}×{grid_size} = {grid_size**2} patches')

# Step 3: 3D Conv (temporal merge): same grid, one temporal slot per frame pair
ax = axes[2]
ax.set_title('Step 3: 3D Conv\n(temporal depth=2)', fontsize=10, fontweight='bold')
_draw_checkerboard(ax, grid_size, plt.cm.Purples, 0.5,
                   f'{grid_size}×{grid_size} = {grid_size**2} tubes\n(2 frames → 1 temporal slot)')

# Step 4: 2×2 spatial merge: half as many cells per side
ax = axes[3]
ax.set_title('Step 4: 2×2 Merge\n(spatial compression)', fontsize=10, fontweight='bold')
_draw_checkerboard(ax, merged_size, plt.cm.Oranges, 1,
                   f'{merged_size}×{merged_size} = {merged_size**2} tokens\n(+ 2 delimiter tokens = {merged_size**2 + 2})')

plt.suptitle('Vision Encoder Pipeline: Frames → Visual Tokens', fontsize=13, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Visualization: M-RoPE position ID assignment

# One stacked panel per M-RoPE component; all panels share the token-position axis.
component_names = ['Temporal (t)', 'Height (h)', 'Width (w)']
component_colors = ['#e74c3c', '#2ecc71', '#3498db']
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(16, 8))

# Build a sequence: [text_tokens] [image_tokens] [text_tokens] [video_tokens] [text_tokens]
# Text: "Describe this image" (4 tokens)
# Image: 3×3 = 9 tokens (simplified)
# Text: "Now watch this video" (5 tokens)
# Video: 3 frames × 2×2 = 12 tokens (simplified)
# Text: "What happened?" (3 tokens)

# Every entry of `segments` is (kind, temporal_ids, height_ids, width_ids).
segments = []
labels = []
bg_colors = []

# === TEXT 1 ===
# Text tokens carry the same running index in all three components.
n_text1 = 4
t_ids, h_ids, w_ids = (list(range(n_text1)) for _ in range(3))
segments.append(('Text', t_ids, h_ids, w_ids))
pos_offset = n_text1

# === IMAGE (3×3 grid, constant temporal) ===
img_h, img_w = 3, 3
img_cells = [(row, col) for row in range(img_h) for col in range(img_w)]
t_ids_img = [pos_offset for _ in img_cells]  # constant temporal
h_ids_img = [pos_offset + row for row, _ in img_cells]
w_ids_img = [pos_offset + col for _, col in img_cells]
segments.append(('Image', t_ids_img, h_ids_img, w_ids_img))
# The next segment starts right after the largest ID used in any component.
pos_offset = max(t_ids_img + h_ids_img + w_ids_img) + 1

# === TEXT 2 ===
n_text2 = 5
t_ids, h_ids, w_ids = (list(range(pos_offset, pos_offset + n_text2)) for _ in range(3))
segments.append(('Text', t_ids, h_ids, w_ids))
pos_offset += n_text2

# === VIDEO (3 frames, each 2×2) ===
vid_frames, vid_h, vid_w = 3, 2, 2
vid_cells = [
    (frame, row, col)
    for frame in range(vid_frames)
    for row in range(vid_h)
    for col in range(vid_w)
]
t_ids_vid = [pos_offset + frame for frame, _, _ in vid_cells]
h_ids_vid = [pos_offset + row for _, row, _ in vid_cells]
w_ids_vid = [pos_offset + col for _, _, col in vid_cells]
segments.append(('Video', t_ids_vid, h_ids_vid, w_ids_vid))
pos_offset = max(t_ids_vid + h_ids_vid + w_ids_vid) + 1

# === TEXT 3 ===
n_text3 = 3
t_ids, h_ids, w_ids = (list(range(pos_offset, pos_offset + n_text3)) for _ in range(3))
segments.append(('Text', t_ids, h_ids, w_ids))

# Plot each component
seg_colors_map = {'Text': '#f0f0f0', 'Image': '#d5e8d4', 'Video': '#dae8fc'}

# A segment tuple is (kind, t_ids, h_ids, w_ids), so panel k (1-based, in the
# order of component_names) reads its IDs from seg[k].
for comp_slot, (ax, comp_name, comp_color) in enumerate(
        zip(axes, component_names, component_colors), start=1):
    pos = 0  # x position of the current segment's first token
    for seg in segments:
        seg_type, ids = seg[0], seg[comp_slot]
        n = len(ids)
        x_first, x_end = pos, pos + n

        # Background for segment (aligned with the outer edges of its 0.8-wide bars)
        ax.axvspan(x_first - 0.4, x_end - 0.6, color=seg_colors_map[seg_type], alpha=0.5)

        # Plot IDs
        ax.bar(range(x_first, x_end), ids, color=comp_color, alpha=0.8, edgecolor='white', width=0.8)

        # Label segment
        ax.text(pos + n/2 - 0.5, max(ids) + 1.5, seg_type, ha='center', fontsize=8,
                fontweight='bold', fontstyle='italic', alpha=0.6)

        pos = x_end

    ax.set_ylabel(comp_name, fontsize=11, fontweight='bold', color=comp_color)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

axes[-1].set_xlabel('Token Position in Sequence', fontsize=11)
axes[0].set_title('M-RoPE Position ID Assignment Across Modalities', fontsize=13, fontweight='bold')

# Add legend: one swatch per segment kind, in the colour map's order
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=shade, edgecolor='gray', label=kind)
    for kind, shade in seg_colors_map.items()
]
axes[0].legend(handles=legend_elements, loc='upper left', fontsize=9)

plt.tight_layout()
plt.show()

# Comparison: 1D-RoPE vs M-RoPE max position IDs

num_frames_range = np.arange(10, 510, 10)
spatial_grid = 8  # 8x8 merged tokens per frame

# 1D-RoPE: position IDs are sequential, so the maximum grows with the token count
max_1d_ids = num_frames_range * (spatial_grid * spatial_grid)

# M-RoPE: max ID is max(temporal, height, width)
max_mrope_ids = np.maximum(num_frames_range, spatial_grid)

# How many times larger the 1D maximum is
ratio = max_1d_ids / max_mrope_ids

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Styling shared by both panels
for panel in (ax1, ax2):
    panel.set_xlabel('Number of Video Frames', fontsize=11)
    panel.grid(True, alpha=0.3)
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)

# Left: absolute comparison (log scale)
for series, line_style, series_label in (
        (max_1d_ids, 'r-', '1D-RoPE (max pos ID)'),
        (max_mrope_ids, 'b-', 'M-RoPE (max pos ID)')):
    ax1.plot(num_frames_range, series, line_style, linewidth=2, label=series_label)
ax1.fill_between(num_frames_range, max_mrope_ids, max_1d_ids, alpha=0.1, color='red')
ax1.set_ylabel('Maximum Position ID', fontsize=11)
ax1.set_title('Position ID Growth: 1D-RoPE vs M-RoPE', fontsize=12, fontweight='bold')
ax1.legend(fontsize=10)
ax1.set_yscale('log')

# Right: compression ratio
ax2.plot(num_frames_range, ratio, 'g-', linewidth=2)
ax2.fill_between(num_frames_range, 1, ratio, alpha=0.2, color='green')
ax2.set_ylabel('Compression Ratio (1D / M-RoPE)', fontsize=11)
ax2.set_title('M-RoPE Position ID Compression Ratio', fontsize=12, fontweight='bold')
final_ratio = ratio[-1]
ax2.annotate(f'At 500 frames: {final_ratio:.0f}× compression',
             xy=(500, final_ratio), xytext=(350, final_ratio*0.7),
             arrowprops=dict(arrowstyle='->', color='green'),
             fontsize=10, fontweight='bold', color='green')

plt.tight_layout()
plt.show()

# Visualization: Absolute time encoding in M-RoPE temporal IDs

fig, axes = plt.subplots(2, 1, figsize=(14, 7))

# Scenario: 10-second video with different sampling rates
video_duration = 10  # seconds

# ---- Top plot: Fixed 2fps sampling ----
ax = axes[0]
ax.set_title('Fixed 2 FPS Sampling — Uniform Temporal ID Spacing', fontsize=11, fontweight='bold')

fps_fixed = 2
n_frames = int(video_duration * fps_fixed)
timestamps = np.arange(n_frames) / fps_fixed
# This illustration gives every sampled frame its own sequential ID
# (no pair merging is applied here).
temporal_ids = np.arange(n_frames)

ax.stem(timestamps, temporal_ids, linefmt='b-', markerfmt='bo', basefmt='gray', label='Temporal IDs')
for t, tid in zip(timestamps, temporal_ids):
    ax.annotate(f't_id={tid}', (t, tid), textcoords="offset points", xytext=(5, 5), fontsize=7)
ax.set_ylabel('Temporal Position ID', fontsize=10)
ax.set_xlabel('Wall-Clock Time (seconds)', fontsize=10)
ax.legend(fontsize=9)
ax.grid(True, alpha=0.3)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# ---- Bottom plot: Dynamic FPS with absolute time alignment ----
ax = axes[1]
ax.set_title('Dynamic FPS with Absolute Time Alignment — Variable ID Spacing', fontsize=11, fontweight='bold')

# Simulate dynamic sampling: higher FPS during action (2-5s), lower otherwise.
# 0 s and 1 s, then every 0.25 s from 2 s to 5 s inclusive, then 6 s .. 10 s.
timestamps_dynamic = np.concatenate([
    [0.0, 1.0],
    np.arange(2.0, 5.25, 0.25),
    np.arange(6.0, 11.0),
])
# Temporal IDs proportional to time (at some base rate), truncated to integers
base_rate = 2  # IDs per second
temporal_ids_dynamic = (timestamps_dynamic * base_rate).astype(int)

# Color code by sampling density: red inside [2 s, 5 s], blue elsewhere
colors_dynamic = ['#e74c3c' if 2 <= t <= 5 else '#3498db' for t in timestamps_dynamic]

markerline, stemlines, baseline = ax.stem(timestamps_dynamic, temporal_ids_dynamic,
                                           linefmt='gray', markerfmt='o', basefmt='gray')
markerline.set_color('#333')
# Color individual stem lines by sampling density
stemlines.set_color(colors_dynamic)

# Highlight regions (only the first blue span gets a legend entry)
ax.axvspan(2, 5, alpha=0.1, color='red', label='High-action region (4 fps)')
ax.axvspan(0, 2, alpha=0.1, color='blue', label='Static region (1 fps)')
ax.axvspan(5, 10, alpha=0.1, color='blue')

ax.set_ylabel('Temporal Position ID', fontsize=10)
ax.set_xlabel('Wall-Clock Time (seconds)', fontsize=10)
ax.legend(fontsize=9, loc='upper left')
ax.grid(True, alpha=0.3)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# Annotate the gap difference
gap_notes = [
    ('ID gap = 2\n(1 fps)', (1.5, 3), 'blue'),
    ('ID gap = 0-1\n(4 fps)', (3.5, 7), 'red'),
]
for note_text, note_xy, note_color in gap_notes:
    ax.annotate(note_text, xy=note_xy, fontsize=9, color=note_color,
                ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Visualization: M-RoPE dimension allocation

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (panel title, [temporal, height, width] section sizes, style name)
models = [
    ('Qwen2-VL\nmrope_section', [16, 24, 24], 'Standard M-RoPE'),
    ('Qwen2.5-VL\nmrope_section', [16, 24, 24], 'Standard M-RoPE'),
    ('Qwen3-VL\nmrope_section', [24, 20, 20], 'Interleaved M-RoPE'),
]

colors_rope = ['#e74c3c', '#2ecc71', '#3498db']  # temporal, height, width
labels_rope = ['Temporal', 'Height', 'Width']

for ax, (model_name, sections, style) in zip(axes, models):
    total = sum(sections)
    # Each wedge label shows the component name and its share of the total
    wedge_labels = [
        f'{component}\n({size}/{total})'
        for component, size in zip(labels_rope, sections)
    ]
    wedges, texts, autotexts = ax.pie(
        sections,
        labels=wedge_labels,
        colors=colors_rope,
        autopct='%1.0f%%',
        startangle=90,
        pctdistance=0.65,
        textprops={'fontsize': 9},
    )
    # Emphasise the percentage text drawn inside the wedges
    for pct_text in autotexts:
        pct_text.set_fontweight('bold')
        pct_text.set_fontsize(10)
    ax.set_title(f'{model_name}\n({style})', fontsize=10, fontweight='bold')

plt.suptitle('M-RoPE Dimension Allocation Across Qwen-VL Generations',
             fontsize=13, fontweight='bold', y=1.05)
plt.tight_layout()
plt.show()

# ============================================================================
# SOURCE CODE WALKTHROUGH: M-RoPE Position ID Generation
# From: qwen-vl-finetune/qwenvl/data/rope2d.py
# https://github.com/QwenLM/Qwen3-VL/tree/main/qwen-vl-finetune/qwenvl/data/rope2d.py
# ============================================================================

import numpy as np

# --- Qwen2.5-VL: get_rope_index_25() —— Absolute Time Encoding ---
#
# The docstring from the actual source explains the scheme clearly:
#
#   For pure text: temporal, height, width position IDs are all identical (= 1D RoPE).
#     input_ids:  [T  T  T  T  T]
#     temporal:   [0, 1, 2, 3, 4]
#     height:     [0, 1, 2, 3, 4]
#     width:      [0, 1, 2, 3, 4]
#
#   For video (3 temporal patches, 2 height, 2 width) followed by text:
#     input_ids:  [V  V  V  V  V  V  V  V  V  V  V  V  T  T  T  T  T]
#
#     ⭐ The temporal position IDs use ABSOLUTE TIME via `second_per_grid_ts`:
#     temporal:   [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
#                  ^frame 0       ^frame 1          ^frame 2
#     height:     [0, 0, 1, 1,  0,  0,  1,  1,   0,   0,   1,   1]
#     width:      [0, 1, 0, 1,  0,  1,  0,  1,   0,   1,   0,   1]
#
#   The interval=50 comes from: tokens_per_second × temporal_patch_size / fps
#   e.g., 25 tokens/s × 2 frames/patch / 1 fps = 50
#
#   Text tokens resume from the max visual position ID + 1:
#     temporal:   [..., 100, 100, 100, 100,  101, 102, 103, 104, 105]
#     height:     [...,   0,   0,   1,   1,  101, 102, 103, 104, 105]
#     width:      [...,   0,   1,   0,   1,  101, 102, 103, 104, 105]

# Here is the KEY inner loop from get_rope_index_25 (simplified, using numpy):
# Original source uses torch — we replicate the logic with numpy for illustration.

def compute_video_position_ids_25(t, h, w, second_per_grid_ts, spatial_merge_size=2):
    """
    Compute M-RoPE position IDs for a single video in Qwen2.5-VL style.

    Args:
        t: number of temporal grid steps (used as-is).
        h, w: spatial grid height/width BEFORE spatial merging; each is
            floor-divided by `spatial_merge_size` inside this function.
        second_per_grid_ts: multiplier applied to the temporal step index; the
            product is truncated to an integer.
        spatial_merge_size: spatial merge factor (default=2)

    Returns:
        Integer array of shape (3, t * (h // spatial_merge_size) * (w // spatial_merge_size))
        whose rows are the temporal, height and width position IDs.
    """
    llm_grid_t = t
    llm_grid_h = h // spatial_merge_size
    llm_grid_w = w // spatial_merge_size
    tokens_per_step = llm_grid_h * llm_grid_w

    # ⭐ KEY: temporal IDs are spaced by real-time intervals
    # In the source:  t_index = torch.arange(llm_grid_t).view(-1,1).expand(-1, llm_grid_h*llm_grid_w).flatten()
    # Every token of temporal step k gets index k, then the index is scaled.
    step_index = np.repeat(np.arange(llm_grid_t), tokens_per_step)
    t_index = (step_index * second_per_grid_ts).astype(int)  # Scale by actual time interval!

    # Spatial IDs just count rows and columns (reset for each frame)
    # In the source:  h_index = torch.arange(llm_grid_h).view(1,-1,1).expand(llm_grid_t,-1,llm_grid_w).flatten()
    # Row index: each row value repeated across its columns, tiled per step.
    h_index = np.tile(np.repeat(np.arange(llm_grid_h), llm_grid_w), llm_grid_t).astype(int)
    # Column index: 0..w-1 repeated for every row of every step.
    w_index = np.tile(np.arange(llm_grid_w), llm_grid_t * llm_grid_h).astype(int)

    return np.stack([t_index, h_index, w_index])


# Demo: Qwen2.5-VL with a 10-second video at 2fps
fps = 2.0
temporal_patch_size = 2  # 3D conv merges pairs
video_duration = 10      # seconds
total_temporal_patches = int(video_duration * fps / temporal_patch_size)  # 10 * 2 / 2 = 10

# Seconds of video covered by one temporal grid step:
# temporal_patch_size frames at `fps` frames per second = 2 / 2.0 = 1.0 s.
# The call below multiplies this by 25, the tokens_per_second value used in the
# interval=50 example in the comments above, to get the temporal ID stride.
second_per_grid_ts = 1.0 / fps * temporal_patch_size

h_grid, w_grid = 4, 4  # Small example: 4×4 spatial grid after merge
tokens_per_patch = h_grid * w_grid
total_visual_tokens = total_temporal_patches * tokens_per_patch

pos_ids = compute_video_position_ids_25(
    t=total_temporal_patches,
    h=h_grid * 2,  # pre-merge dimensions: the function halves them again
    w=w_grid * 2,
    second_per_grid_ts=second_per_grid_ts * 25,  # scaled by tokens_per_second=25
)
temporal_unique, height_unique, width_unique = (
    sorted(set(component.tolist())) for component in pos_ids
)

print("Qwen2.5-VL M-RoPE Position IDs (10s video, 2fps, 4×4 spatial grid)")
print("=" * 65)
print(f"  Video: {video_duration}s at {fps}fps → {total_temporal_patches} temporal patches")
print(f"  Spatial: {h_grid}×{w_grid} = {tokens_per_patch} tokens per temporal patch")
print(f"  Total visual tokens: {total_visual_tokens}")
print()
print(f"  Temporal IDs (unique): {temporal_unique}")
print(f"  Height IDs  (unique):  {height_unique}")
print(f"  Width IDs   (unique):  {width_unique}")
print(f"  Max position ID:       {pos_ids.max()}")
print(f"  If using 1D-RoPE:      max ID would be {total_visual_tokens - 1}")

# ============================================================================
# Qwen3-VL: get_rope_index_3() — Timestamp-Based Temporal Encoding
# ============================================================================
#
# KEY DIFFERENCE from Qwen2.5-VL (from the docstring):
#   "Different from the original implementation, Qwen3VL uses timestamps
#    rather than absolute time position ids."
#
# The big change: instead of assigning a SINGLE temporal ID block per frame,
# Qwen3-VL treats EACH FRAME as an independent image (t=1) and inserts
# timestamp tokens BETWEEN frames.
#
# Here is how the actual source code transforms the video data:
#
#   ⭐ STEP 1: Explode video_grid_thw so each frame stands alone
#   video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0)
#   video_grid_thw[:, 0] = 1   # Every frame is now a "single-frame video"
#
#   ⭐ STEP 2: For each frame, the local index pattern restarts (just like an image);
#   the source then adds the running offset (text_len + st_idx) to all three rows:
#   t_index = [0, 0, 0, 0]   (all same — single frame)
#   h_index = [0, 0, 1, 1]   (row positions)
#   w_index = [0, 1, 0, 1]   (column positions)
#
#   ⭐ STEP 3: Timestamp tokens between frames get st_idx incremented
#   Each <|im_start|>, timestamp text, <|im_end|> between frames acts as
#   a 1D-RoPE segment (all 3 dims equal), naturally separating frames
#   in the temporal dimension.
#
# This means:
# - Frames don't need explicit temporal position IDs (spatial-only like images)
# - Temporal ordering comes from timestamp TEXT tokens between frames
# - The model learns temporal relationships from the timestamp values
# - It's more flexible: frame spacing can be irregular

def visualize_qwen3_position_ids(n_frames=3, h=2, w=2, timestamp_tokens=5):
    """
    Simulate the M-RoPE position IDs that get_rope_index_3() would produce
    for a short video with n_frames, each having h×w spatial tokens, and
    timestamp_tokens between each pair of frames.

    Every frame is treated as an independent image that starts at the running
    position counter `st_idx`: its temporal ID is `st_idx` for all tokens and
    its height/width IDs are `st_idx + row` / `st_idx + col`. All three
    components are shifted by the same offset, matching the image example
    earlier in this file. Timestamp tokens between frames get the same
    consecutive index in all three components.

    Args:
        n_frames: number of frames to simulate.
        h, w: spatial token grid of each frame (rows, columns).
        timestamp_tokens: number of text tokens inserted between two frames.

    Returns:
        Tuple (temporal_ids, height_ids, width_ids, labels) of equal-length
        lists; labels are "F<i>" for tokens of frame i and "TS" for timestamp
        tokens.
    """
    all_temporal = []
    all_height = []
    all_width = []
    labels = []

    st_idx = 0  # Running position counter (from source: starts at 0 or continues from previous)
    n_tokens = h * w

    for i in range(n_frames):
        # --- Frame i: treated as an independent image starting at st_idx ---
        all_temporal.extend([st_idx] * n_tokens)  # single frame, t=1
        all_height.extend(st_idx + r for r in range(h) for _ in range(w))
        all_width.extend(st_idx + c for _ in range(h) for c in range(w))
        labels.extend([f"F{i}"] * n_tokens)

        # ⭐ KEY: st_idx jumps to max+1 across all 3 dims.
        # The largest ID in this frame is st_idx + max(h, w) - 1.
        if n_tokens:
            st_idx += max(h, w)

        # --- Timestamp tokens between frames (1D-RoPE: all dims equal) ---
        if i < n_frames - 1:
            ts_ids = range(st_idx, st_idx + timestamp_tokens)
            all_temporal.extend(ts_ids)
            all_height.extend(ts_ids)
            all_width.extend(ts_ids)
            labels.extend(["TS"] * timestamp_tokens)
            st_idx += timestamp_tokens

    return all_temporal, all_height, all_width, labels


temporal, height, width, labels = visualize_qwen3_position_ids(
    n_frames=3, h=2, w=2, timestamp_tokens=4
)

print("Qwen3-VL M-RoPE Position IDs (3 frames, 2×2 spatial, 4 timestamp tokens)")
print("=" * 72)
print()

# Print a nice table
header = f"{'Token':>6}  {'temporal':>8}  {'height':>8}  {'width':>8}"
print(header)
print("-" * len(header))

for t, h, w, lab in zip(temporal, height, width, labels):
    marker = "  ← timestamp" if lab == "TS" else f"  ← frame {lab}"
    print(f"{lab:>6}  {t:>8}  {h:>8}  {w:>8}{marker}")

print()
print("KEY OBSERVATIONS:")
print("  • Each frame's spatial IDs restart their row/col pattern, offset by the frame's start index — just like an image")
print("  • Temporal IDs are FLAT within each frame (all same value)")
print("  • Timestamp tokens use 1D-RoPE (all 3 dims equal) — separates frames")
print("  • Frame temporal position comes from the TIMESTAMP TEXT, not from an index")
print()
# Compare max position IDs
print(f"  Max position ID used:  {max(max(temporal), max(height), max(width))}")
n_visual = sum(1 for l in labels if l != "TS")
n_ts = sum(1 for l in labels if l == "TS")
print(f"  Visual tokens: {n_visual},  Timestamp tokens: {n_ts},  Total: {len(labels)}")

# Visual comparison: Qwen2.5-VL vs Qwen3-VL temporal position ID patterns
#
# Draws two side-by-side scatter plots of (temporal, height, width) position
# IDs against token index: a hand-built Qwen2.5-VL-style sequence on the left
# and the output of visualize_qwen3_position_ids() on the right.

from itertools import groupby

import matplotlib.pyplot as plt
import numpy as np

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# --- Left: Qwen2.5-VL style (block temporal IDs) ---
ax = axes[0]
n_frames_25, h_25, w_25 = 4, 2, 2
tokens_per_frame = h_25 * w_25
interval = 50  # second_per_grid_ts * tokens_per_second

# Every token of frame f shares the temporal ID f * interval, while the
# height/width IDs restart at 0 for each frame.
temporal_25 = []
height_25 = []
width_25 = []
for f in range(n_frames_25):
    for r in range(h_25):
        for c in range(w_25):
            temporal_25.append(f * interval)
            height_25.append(r)
            width_25.append(c)

x_pos = np.arange(len(temporal_25))
ax.scatter(x_pos, temporal_25, c='#e74c3c', s=60, label='Temporal', zorder=3)
ax.scatter(x_pos, height_25, c='#2ecc71', s=40, marker='s', label='Height', zorder=3)
ax.scatter(x_pos, width_25, c='#3498db', s=40, marker='^', label='Width', zorder=3)

# Frame separators
for f in range(1, n_frames_25):
    ax.axvline(x=f * tokens_per_frame - 0.5, color='gray', linestyle='--', alpha=0.5)

top_25 = max(temporal_25)  # computed once; used for the y-limit and the labels
ax.set_title("Qwen2.5-VL: Absolute Time Encoding", fontweight='bold', fontsize=11)
ax.set_xlabel("Token index")
ax.set_ylabel("Position ID")
ax.legend(fontsize=8, loc='upper left')
ax.set_ylim(-5, top_25 + 20)
# Label frames (centred over each frame's run of tokens)
for f in range(n_frames_25):
    ax.text(f * tokens_per_frame + tokens_per_frame/2 - 0.5, top_25 + 10,
            f"Frame {f}", ha='center', fontsize=8, color='gray')

# --- Right: Qwen3-VL style (per-frame reset + timestamps) ---
ax = axes[1]
temporal_3, height_3, width_3, labels_3 = visualize_qwen3_position_ids(
    n_frames=4, h=2, w=2, timestamp_tokens=3
)

x_pos = np.arange(len(temporal_3))
temporal_arr = np.asarray(temporal_3)

# Boolean masks splitting the sequence into visual tokens and "TS" tokens.
vis_mask = np.array([lab != 'TS' for lab in labels_3])
ts_mask = ~vis_mask

# Visual tokens
ax.scatter(x_pos[vis_mask], temporal_arr[vis_mask], c='#e74c3c', s=60,
           label='Temporal (visual)', zorder=3)
ax.scatter(x_pos[vis_mask], np.asarray(height_3)[vis_mask], c='#2ecc71', s=40,
           marker='s', label='Height (visual)', zorder=3)
ax.scatter(x_pos[vis_mask], np.asarray(width_3)[vis_mask], c='#3498db', s=40,
           marker='^', label='Width (visual)', zorder=3)

# Timestamp tokens (all 3 dims same — shown as diamonds), so plotting the
# temporal component alone is enough.
ax.scatter(x_pos[ts_mask], temporal_arr[ts_mask], c='#9b59b6', s=50,
           marker='D', label='Timestamp (all dims)', zorder=3)

ax.set_title("Qwen3-VL: Timestamp-Based Encoding", fontweight='bold', fontsize=11)
ax.set_xlabel("Token index")
ax.set_ylabel("Position ID")
ax.legend(fontsize=7, loc='upper left')

# Walk the label sequence as alternating runs of visual / timestamp tokens.
# Each run gets a dashed separator at its left edge (except the first run)
# and a caption centred above it.
label_y = max(temporal_3) + 1
run_start = 0
frame_idx = 0
for is_ts, run in groupby(labels_3, key=lambda lab: lab == 'TS'):
    run_len = sum(1 for _ in run)
    if run_start > 0:
        ax.axvline(x=run_start - 0.5, color='gray', linestyle='--', alpha=0.3)
    mid = run_start + (run_len - 1) / 2
    if is_ts:
        ax.text(mid, label_y, "TS", ha='center', fontsize=7, color='#9b59b6')
    else:
        ax.text(mid, label_y, f"F{frame_idx}", ha='center', fontsize=8, color='gray')
        frame_idx += 1
    run_start += run_len

plt.suptitle("M-RoPE Temporal Position ID Patterns: Qwen2.5-VL vs Qwen3-VL",
             fontweight='bold', fontsize=13, y=1.02)
plt.tight_layout()
plt.show()

print("\nSummary of Architectural Differences:")
print("  Qwen2.5-VL: Large temporal ID gaps between frames (interval=50)")
print("              Encodes absolute wall-clock time directly in position IDs")
print("              Spatial IDs reset each frame, temporal IDs grow with time")
print()
print("  Qwen3-VL:   Each frame is an independent image (spatial IDs reset)")
print("              Timestamp TEXT tokens between frames carry temporal info")
print("              Position IDs grow smoothly; temporal meaning is in the content")
print("              More flexible — works naturally with variable frame rates")

# Visualization: Task effectiveness spectrum
#
# Horizontal bar chart of hard-coded per-task scores, coloured by a
# three-level status marker.

# Each entry: (task label, relative score in [0, 1], status marker)
tasks = [
    ('Video QA\n(short clips)', 0.92, '✅'),
    ('Video OCR &\nText Extraction', 0.90, '✅'),
    ('UI Agent\nScreen Recording', 0.85, '✅'),
    ('Temporal\nGrounding', 0.80, '✅'),
    ('Video\nSummarization', 0.78, '✅'),
    ('Long Video\n(1+ hr) QA', 0.70, '⚠️'),
    ('Real-time\nVideo Chat', 0.65, '⚠️'),
    ('Fine-grained\nAction Recognition', 0.55, '⚠️'),
    ('Precise Counting\n& Tracking', 0.40, '❌'),
    ('3D Spatial\nReasoning', 0.35, '❌'),
    ('Audio-Visual\nReasoning', 0.25, '❌'),
]

# Unzip the triples into three parallel lists.
task_names, scores, status = map(list, zip(*tasks))

# Status marker -> bar colour; also the source of the legend swatches below.
color_map = {'✅': '#2ecc71', '⚠️': '#f39c12', '❌': '#e74c3c'}
bar_colors = [color_map[marker] for marker in status]

rows = range(len(tasks))
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(rows, scores, color=bar_colors, edgecolor='white', linewidth=1.5, height=0.7)
ax.set_yticks(rows)
ax.set_yticklabels(task_names, fontsize=9)
ax.set_xlabel('Relative VLM Effectiveness', fontsize=11)
ax.set_title('VLM Effectiveness Across Video Understanding Tasks', fontsize=13, fontweight='bold')
ax.set_xlim(0, 1.15)  # headroom to the right of the bars for the text labels
ax.invert_yaxis()     # first task in the list ends up at the top

# Add status labels just past the end of each bar
for row, (score, marker) in enumerate(zip(scores, status)):
    ax.text(score + 0.02, row, f'{marker} {score:.0%}', va='center', fontsize=9, fontweight='bold')

# Legend: one swatch per status level
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=color_map[marker], label=caption)
    for marker, caption in (
        ('✅', 'Strong performance'),
        ('⚠️', 'Improving / Mixed'),
        ('❌', 'Significant challenges'),
    )
]
ax.legend(handles=legend_elements, loc='lower right', fontsize=9)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

# Visualization: The information loss from frame sampling
#
# Left panel: percentage of source frames kept as a function of sampling rate.
# Right panel: quadratic attention cost versus total visual tokens, with
# vertical markers at the token counts of a few video durations.

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Information retention vs sampling rate
ax = axes[0]
original_fps = 30
sample_rates = np.linspace(0.5, 30, 100)
retention = sample_rates / original_fps * 100  # percent of frames kept

ax.fill_between(sample_rates, retention, 100, alpha=0.15, color='red', label='Information lost')
ax.fill_between(sample_rates, 0, retention, alpha=0.15, color='green', label='Information retained')
ax.plot(sample_rates, retention, 'k-', linewidth=2)

# Mark common operating points
for fps, label, color in [(1, '1 fps', '#e74c3c'), (2, '2 fps (default)', '#f39c12'),
                           (4, '4 fps', '#2ecc71'), (8, '8 fps', '#3498db')]:
    ret = fps / original_fps * 100
    ax.plot(fps, ret, 'o', color=color, markersize=10, zorder=5)
    ax.annotate(f'{label}\n({ret:.1f}%)', xy=(fps, ret), xytext=(fps + 2, ret + 5),
                fontsize=8, fontweight='bold', color=color,
                arrowprops=dict(arrowstyle='->', color=color, lw=1.5))

ax.set_xlabel('Sampling Rate (fps)', fontsize=11)
ax.set_ylabel('Frame Retention (%)', fontsize=11)
ax.set_title('Frame Information Retention vs Sampling Rate\n(original: 30 fps)', fontsize=11, fontweight='bold')
ax.legend(fontsize=9, loc='center right')
ax.set_ylim(0, 105)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.grid(True, alpha=0.3)

# Right: Compute cost scaling
ax = axes[1]
n_tokens = np.linspace(100, 100000, 200)
attention_cost = n_tokens ** 2

# Draw the curve and fix the log scales first, so the duration markers below
# are placed against the final axes rather than the default empty (0, 1) view.
ax.plot(n_tokens, attention_cost, 'r-', linewidth=2, label='$O(n^2)$ attention')
ax.set_yscale('log')
ax.set_xscale('log')

# Mark video lengths
durations = [10, 60, 300, 3600]  # seconds
duration_labels = ['10s', '1min', '5min', '1hr']
fps = 2
# Tokens per frame after 2x2 merging for the 448×448 resolution named in the
# title: (448 // 14 // 2) ** 2 = 256.
frame_side, patch_size, merge_factor = 448, 14, 2
tokens_per_frame = (frame_side // patch_size // merge_factor) ** 2

# x is given in data coordinates, y as a fraction of the axes height, so the
# labels stay inside the panel whatever the y-range is.
marker_transform = ax.get_xaxis_transform()
for dur, label in zip(durations, duration_labels):
    n_tok = dur * fps * tokens_per_frame / 2  # /2 for 3D conv
    # Durations beyond the plotted x-range are skipped; with these settings
    # that is the 1hr entry (921,600 tokens > 100,000).
    if n_tok > n_tokens[-1]:
        continue
    ax.axvline(x=n_tok, color='gray', linestyle=':', alpha=0.5)
    # Anchored at the bottom because the curve runs through the upper right.
    ax.text(n_tok, 0.02, f' {label}\n({n_tok:.0f} tok)', transform=marker_transform,
            fontsize=8, rotation=0, va='bottom')

ax.set_xlabel('Total Visual Tokens', fontsize=11)
ax.set_ylabel('Relative Attention Cost', fontsize=11)
ax.set_title('Attention Cost Scaling with Video Length\n(2 fps, 448×448)', fontsize=11, fontweight='bold')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.grid(True, alpha=0.3)
# Pinned to the upper left so it cannot collide with the bottom-anchored labels.
ax.legend(fontsize=10, loc='upper left')

plt.tight_layout()
plt.show()

Challenge	Description
Temporal Complexity	Events unfold over time — the model must relate frames that may be seconds or minutes apart
Massive Data Volume	A 30fps, 1080p video generates ~2M pixels per frame × 30 frames/sec = ~60M pixels/sec
Redundancy	Adjacent frames are highly similar — naive processing wastes compute on near-duplicate information
Long-Range Dependencies	Understanding a plot twist at minute 45 may require context from minute 5
Multi-Modal Reasoning	Audio, text overlays, scene changes, and visual content all carry meaning

Generation	Release	Key Innovations
Qwen-VL	Aug 2023	First VL model in the Qwen family; fixed resolution
Qwen2-VL	Sep 2024	Naive Dynamic Resolution, M-RoPE, unified image/video processing, 3D convolutions
Qwen2.5-VL	Jan 2025	Dynamic FPS training, absolute time encoding, window attention in ViT, SwiGLU/RMSNorm
Qwen3-VL	Sep 2025	Interleaved-MRoPE, DeepStack multi-level ViT fusion, Text-Timestamp Alignment, 256K native context

Model	ViT Params	LLM Params	Total
Qwen2.5-VL-3B	~675M	3B	~3.7B
Qwen2.5-VL-7B	~675M	7.6B	~8.3B
Qwen2.5-VL-72B	~675M	72B	~72.7B
Qwen3-VL-8B	—	8B	~8B
Qwen3-VL-32B	—	32B	~32B
Qwen3-VL-235B-A22B	—	235B (22B active, MoE)	~235B

Image Resolution	Patches (14×14)	After 2×2 Merge	+ Delimiters
224 × 224	16 × 16 = 256	64	66
448 × 448	32 × 32 = 1,024	256	258
896 × 896	64 × 64 = 4,096	1,024	1,026
1344 × 896	96 × 64 = 6,144	1,536	1,538

Encoding	Max Position ID	Tokens
1D-RoPE	6,400	100 × 64 = 6,400
M-RoPE	max(100, 8, 8) = 100	100 × 64 = 6,400

Function	Model	Key Difference
`get_rope_index_2()`	Qwen2-VL	Frame-index-based temporal IDs
`get_rope_index_25()`	Qwen2.5-VL	Absolute time–based temporal IDs (`second_per_grid_ts`)
`get_rope_index_3()`	Qwen3-VL	Uses text timestamps instead of absolute-time temporal position IDs; each frame is handled as its own `t=1` grid

Model	MVBench	PerceptionTest	Video-MME (w/o subs)
GPT-4o	—	—	71.9
Qwen2-VL-72B	73.6	68.0	71.2
Qwen2.5-VL-7B	69.6	70.5	65.1

Dimension	Current State	Key Bottleneck
Short video QA	✅ Strong	Benchmark saturation
Video OCR	✅ Strong	Frame resolution
Temporal localization	✅ Good (improving)	Absolute time alignment
Long video comprehension	⚠️ Improving	Token budget, context length
Fine-grained temporal reasoning	⚠️ Mixed	Frame sampling, attention limits
Precise counting & tracking	❌ Weak	Not designed for continuous tracking
Audio-visual understanding	❌ Missing	No audio modality
Real-time processing	❌ Limited	Inference latency
3D spatial reasoning from video	⚠️ Emerging	Qwen3-VL introduces 3D grounding

Video Understanding with Vision-Language Models

A Deep Dive into Qwen-VL Architecture & Video Processing

tinyurl.com/video-understanding

Table of Contents

1. Introduction: Video Understanding as an ML Task

Why is Video Understanding Hard?

The VLM Approach

2. The Qwen-VL Model Family

Architecture Overview

Model Sizes

3. Video Sampling & Frame Extraction

3.1 Uniform Temporal Sampling (Fixed FPS)

3.2 Dynamic FPS Sampling (Qwen2.5-VL & Qwen3-VL)

3.3 Token Budget Management

3.4 Where Frame Sampling Happens in Code

4. From Frames to Patches: The Vision Encoder Pipeline

4.1 Patch Extraction

4.2 3D Convolution for Video (Temporal Merging)

4.3 Spatial Token Merging (2×2 Compression)

4.4 Dynamic Resolution

5. Multimodal Rotary Position Embedding (M-RoPE)

5.1 The Problem with 1D Positional Encodings

5.2 M-RoPE: Decomposing Position into Three Components

5.3 How Position IDs Are Assigned

For Text tokens:

For Image tokens:

For Video tokens:

Cross-modality transitions:

5.4 Key Insight: Position ID Compression

6. Temporal Embeddings: How Time Patches Are Created

6.1 From Absolute Time to Temporal Position IDs

How it works:

6.2 The Complete Temporal Patch Pipeline

6.3 Qwen3-VL: Interleaved M-RoPE and Text-Timestamp Alignment

6.4 Where Temporal Embeddings Are Created in Code

7. Effective Uses of VLMs for Video Understanding

7.1 ✅ Video Question Answering (VideoQA)

7.2 ✅ Temporal Event Localization & Grounding

7.3 ✅ Video Summarization & Content Extraction

7.4 ✅ Video OCR & Text-in-Video Understanding

7.5 ✅ Long Video Comprehension (1+ Hours)

7.6 ✅ Visual Agent Tasks on Screen Recordings

8. Challenges & Limitations

8.1 🔴 Frame Sampling Creates an Information Bottleneck

8.2 🔴 Quadratic Attention Scaling

8.3 🔴 Temporal Reasoning Is Shallow

8.4 🔴 No Native Audio Understanding

8.5 🔴 Hallucination & Confabulation

8.6 🟡 Resolution vs Throughput Trade-off

8.7 🟡 Evaluation Challenges

Summary: VLM Strengths vs Challenges for Video Understanding

Looking Forward

9. References

Primary Papers

Foundational Work

Benchmarks

Resources