import os

# Optional .env support: when python-dotenv is installed
# (pip install python-dotenv), variables from a local .env file are loaded
# into the process environment before the keys are read below.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # python-dotenv is not installed; rely on real environment variables

# --- API keys ---------------------------------------------------------------
# Each key is taken from the environment and defaults to an empty string, so
# the notebook can still be imported when a key is absent. Replace the values
# here if you prefer not to use a .env file or environment variables.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
SERPAPI_IMAGE_SEARCH_KEY = os.getenv("SERPAPI_IMAGE_SEARCH_KEY", "")
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")

# Names of the keys that resolved to an empty value, in declaration order.
missing = [
    key_name
    for key_name, key_value in (
        ("DASHSCOPE_API_KEY", DASHSCOPE_API_KEY),
        ("SERPAPI_IMAGE_SEARCH_KEY", SERPAPI_IMAGE_SEARCH_KEY),
        ("REPLICATE_API_TOKEN", REPLICATE_API_TOKEN),
    )
    if not key_value
]
if missing:
    # Warn only; execution continues so cells that do not need the key still run.
    print(f"⚠  Missing keys: {', '.join(missing)}. Add them to a .env file or set as environment variables.")

# Sample news photograph used by the geolocation examples below.
nyt_image = (
    "https://static01.nyt.com/images/2026/02/23/multimedia/23dc-caine-tlgw/"
    "23dc-caine-tlgw-threeByTwoSmallAt2X.jpg"
    "?format=pjpg&quality=75&auto=webp&disable=upscale"
)

%pip install -q dashscope

from IPython.display import Markdown, display

import dashscope
from dashscope import MultiModalConversation

# Regional endpoint. This is the Singapore base URL; for the Virginia region use
# https://dashscope-us.aliyuncs.com/api/v1 and for China (Beijing) use
# https://dashscope.aliyuncs.com/api/v1.
dashscope.base_http_api_url = "https://dashscope-intl.aliyuncs.com/api/v1"

# Key loaded by the setup cell above. API keys differ per region; see
# https://www.alibabacloud.com/help/en/model-studio/get-api-key
api_key = DASHSCOPE_API_KEY

# Toggle for the model's thinking process. qwen3-vl-plus and qwen3-vl-flash
# accept either value; models with the 'thinking' suffix (such as
# qwen3-vl-235b-a22b-thinking) only accept True. Other Qwen-VL models do not
# support this switch.
enable_thinking = True

# Single user turn: the image followed by the instruction text.
messages = [
    dict(
        role="user",
        content=[
            dict(image=nyt_image),
            dict(text="Geocode this image. Provide a detailed account of your reasoning and as exact a location as possible."),
        ],
    )
]

# Streamed request; `response` is iterated chunk by chunk further down.
response = MultiModalConversation.call(
    model="qwen3-vl-plus",
    messages=messages,
    # Replace with api_key="sk-xxx" if the key is not configured in the environment.
    api_key=api_key,
    stream=True,
    enable_thinking=enable_thinking,
    # Upper bound on the number of tokens spent on the reasoning process.
    thinking_budget=81920,
)

# Consume the streamed DashScope response: echo the thinking process and the
# answer as they arrive, and keep the full text of each for rendering below.
from http import HTTPStatus  # local to this cell; used to validate each chunk

# Full text of the thinking process, accumulated across chunks
reasoning_content = ""
# Full text of the answer, accumulated across chunks
answer_content = ""
# Flips to True once the first answer chunk arrives (thinking is then over)
is_answering = False

if enable_thinking:
    print("=" * 20 + "Thinking Process" + "=" * 20)

for chunk in response:
    # A failed request (bad or missing key, wrong region, ...) yields a chunk
    # without `output`; surface the service's own error instead of an
    # AttributeError on `chunk.output.choices`.
    if chunk.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f"DashScope request failed (HTTP {chunk.status_code}): "
            f"{chunk.code} - {chunk.message}"
        )

    message = chunk.output.choices[0].message
    reasoning_content_chunk = message.get("reasoning_content", None)
    content = message.content

    if content:
        # Answer phase: print the separator once, then stream the text.
        if not is_answering:
            print("\n" + "=" * 20 + "Complete Response" + "=" * 20)
            is_answering = True
        text = content[0]["text"]
        print(text, end="")
        answer_content += text
    elif reasoning_content_chunk:
        # Thinking phase: no answer content yet, only reasoning text.
        print(reasoning_content_chunk, end="")
        reasoning_content += reasoning_content_chunk
    # Chunks with neither answer content nor reasoning text are ignored.

# Render the complete thinking process and the complete response as Markdown
print("=" * 20 + "Complete Thinking Process" + "=" * 20 + "\n")
display(Markdown(reasoning_content))
print("=" * 20 + "Complete Response" + "=" * 20 + "\n")
display(Markdown(answer_content))

%pip install -q "qwen-agent"

import os

# Make the SerpAPI key available to qwen-agent's image_search tool.
# The key is published two ways: as an environment variable (set BEFORE the
# module is imported below) and by overwriting the module-level constant.
# NOTE(review): presumably the module reads the environment variable at import
# time and the attribute assignment covers the case where it was already
# imported in this session — confirm against qwen_agent.tools.image_search.
serp_api_key = SERPAPI_IMAGE_SEARCH_KEY  # set in the setup cell above
os.environ["SERPAPI_IMAGE_SEARCH_KEY"] = serp_api_key

# Keep this import after the os.environ assignment above.
import qwen_agent.tools.image_search as img_search_module
img_search_module.SERPAPI_IMAGE_SEARCH_KEY = serp_api_key

from qwen_agent.agents import Assistant
from qwen_agent.utils.output_beautify import typewriter_print, multimodal_typewriter_print

# Vision-language model for the agent. Alternatives: "qwen3-vl-flash" (faster
# and cheaper) or "qwen-vl-ocr" (for typewritten texts). Full list:
# https://www.alibabacloud.com/help/en/model-studio/models
model = "qwen3-vl-plus"

# LLM settings for Qwen-Agent. `model_server` is the base_url (api_base) of an
# OpenAI-compatible endpoint; a self-hosted service such as vLLM or Ollama can
# be substituted here.
llm_cfg = dict(
    model_type='qwenvl_oai',
    model=model,
    model_server='https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
    api_key=api_key,
    generate_cfg=dict(
        top_p=0.8,
        top_k=20,
        temperature=0.7,
        repetition_penalty=1.0,
        presence_penalty=1.5,
    ),
)

# System prompt that asks the model to alternate between looking, researching
# with tools, and reviewing before it writes a final answer.
analysis_prompt = """Your role is that of a research assistant specializing in visual information. Answer questions about images by looking at them closely and then using research tools. Please follow this structured thinking process and show your work.

Start an iterative loop for each question:

- **First, look closely:** Begin with a detailed description of the image, paying attention to the user's question. List what you can tell just by looking, and what you'll need to look up.
- **Next, find information:** Use a tool to research the things you need to find out.
- **Then, review the findings:** Carefully analyze what the tool tells you and decide on your next action.

Continue this loop until your research is complete.

To finish, bring everything together in a clear, synthesized answer that fully responds to the user's question."""

# Built-in Qwen-Agent tools, referenced by their registered names:
#   image_zoom_in_tool - https://github.com/QwenLM/Qwen-Agent/blob/main/qwen_agent/tools/image_zoom_in_qwen3vl.py
#   image_search       - https://github.com/QwenLM/Qwen-Agent/blob/main/qwen_agent/tools/image_search.py
tools = ['image_zoom_in_tool', 'image_search']

# `analysis_prompt` is optional: pass system_message='' to enable the tools
# without the structured-analysis instructions.
agent = Assistant(
    function_list=tools,
    llm=llm_cfg,
    system_message=analysis_prompt,
)

# One user turn: the image plus the question about it.
messages = [
    {
        "role": "user",
        "content": [
            {"image": nyt_image},
            {"text": "Where was the picture taken?"},
        ],
    }
]

# Stream the run. Each `ret_messages` holds all messages produced so far
# (interleaved assistant messages and tool responses); the printer is given
# the text already shown and returns the updated text.
response_plain_text = ''
for ret_messages in agent.run(messages):
    response_plain_text = multimodal_typewriter_print(ret_messages, response_plain_text)

%pip install -q replicate

# Source record for the photograph used in the restoration example:
# https://dpul.princeton.edu/igor-savchenko/catalog/fcbdbe3d-8a5e-4ecb-b362-b8691c56d12b

# IPython shell escape: download the image from Princeton's IIIF server and save
# it as savchenko.jpg in the working directory.
# NOTE(review): the `full/800,` path segment looks like an IIIF size request for
# an 800-pixel-wide rendition — confirm against the IIIF Image API.
!wget https://iiif-cloud.princeton.edu/iiif/2/a2%2Fb3%2Fa4%2Fa2b3a4ea28f44effad3b119855148816%2Fintermediate_file/full/800,/0/default.jpg -O savchenko.jpg

# Preview the downloaded file. `Image` here is IPython's display class; a later
# cell rebinds the same name with `from PIL import Image`, so this import must
# stay with this cell. As the last expression of a notebook cell the object is
# rendered inline; run as a plain script this line has no visible effect.
from IPython.display import Image
Image(filename='savchenko.jpg')

import base64
from pathlib import Path

# Many libraries, archives and museums limit bot traffic, so the image is not
# passed by URL: the downloaded file is embedded as a base64 data URL instead.

mime_type = 'image/jpeg'
encoded_string = base64.b64encode(Path("savchenko.jpg").read_bytes()).decode('utf-8')
encoded_image = "data:{};base64,{}".format(mime_type, encoded_string)

# Show only the head of the data URL as a sanity check (the full value is long).
encoded_image[:200]

import replicate
from replicate.client import Client

# Replicate client authenticated with the token from the setup cell above; a
# custom User-Agent header identifies this notebook to the service.
replicate_client = Client(
    api_token=REPLICATE_API_TOKEN,
    headers={"User-Agent": "replicate-jupyter-example/1.0"},
)

# Run the restoration model on the data-URL-encoded image and request JPEG output.
output = replicate_client.run(
    "flux-kontext-apps/restore-image",
    input=dict(
        input_image=encoded_image,
        output_format="jpg",
        safety_tolerance=2,
    ),
)

# Address of the restored image; it is downloaded and displayed further down.
print(output.url)

import requests
from PIL import Image
from io import BytesIO
from IPython.display import display

# Download the restored image produced by Replicate and show it inline.
image_url = output.url
# Bound the wait so a stalled download cannot hang the notebook, and fail with
# the HTTP status on an error response instead of letting PIL choke on an
# error page that is not image data.
response = requests.get(image_url, timeout=60)
response.raise_for_status()
img = Image.open(BytesIO(response.content))

display(img)

import replicate
from replicate.client import Client

from typing import Dict, List, Optional, OrderedDict, Tuple, Union

from qwen_agent.tools.base import BaseTool, register_tool
from qwen_agent.log import logger
from qwen_agent.llm.schema import Message, ContentItem
from qwen_agent.utils.utils import extract_images_from_messages



@register_tool('restore_image', allow_overwrite=True)
class ImageRestoration(BaseTool):
    """Qwen-Agent tool that restores an image from the conversation via Replicate.

    The tool selects one of the images found in the conversation messages by
    index, sends it to the ``flux-kontext-apps/restore-image`` model on
    Replicate, and returns the URL of the restored image.
    """

    # Registered tool name; must match the name passed to ``register_tool``.
    name = 'restore_image'
    # Description shown to the model when it decides which tool to call.
    description = 'Image restoration tool. Input low-resolution or blurry image that needs clarity or restoration.'
    # JSON schema of the arguments the model must supply.
    parameters = {
        'type': 'object',
        'properties': {
            'img_idx': {
                'type': 'number',
                'description': 'The index of the image (starting from 0)'
            }
        },
        'required': ['img_idx']
    }

    @staticmethod
    def _as_model_input(image_ref: str) -> str:
        """Return ``image_ref`` in a form that can be sent as ``input_image``.

        Remote URLs and data URLs are passed through unchanged. Anything else
        is treated as a local file path and embedded as a base64 data URL,
        with the MIME type guessed from the file name (JPEG when unknown).

        Raises:
            OSError: if a local file cannot be read.
        """
        import mimetypes  # stdlib; imported here to keep the tool self-contained

        if image_ref.startswith(('http://', 'https://', 'data:')):
            return image_ref
        mime_type = mimetypes.guess_type(image_ref)[0] or 'image/jpeg'
        encoded_string = base64.b64encode(Path(image_ref).read_bytes()).decode('utf-8')
        return f"data:{mime_type};base64,{encoded_string}"

    def call(self, params: Union[str, dict], **kwargs) -> Union[str, list]:
        """Restore the image at ``img_idx`` and return its URL.

        Args:
            params: Tool arguments as a JSON string or dict containing
                ``img_idx``, the zero-based index of the image to restore.
            **kwargs: Must carry ``messages``, the conversation from which
                the candidate images are extracted.

        Returns:
            A one-element list holding the URL of the restored image, or an
            error string when no image is available or the restoration fails.
        """
        params = self._verify_json_format_args(params)
        image_id = int(params['img_idx'])
        images = extract_images_from_messages(kwargs.get('messages', []))
        if not images:
            return 'Error: no images found in the messages.'
        # Clamp out-of-range indices to the nearest valid image rather than failing.
        image_id = min(max(image_id, 0), len(images) - 1)

        image_url = images[image_id]
        try:
            replicate_client = Client(
                api_token=REPLICATE_API_TOKEN,  # set in the setup cell above
                headers={
                    "User-Agent": "replicate-jupyter-example/1.0"
                }
            )
            output = replicate_client.run(
                "flux-kontext-apps/restore-image",
                input={
                    "input_image": self._as_model_input(image_url),
                    "output_format": "jpg",
                    "safety_tolerance": 2
                }
            )
            return [output.url]
        except Exception as e:
            # Report the failure to the model, as the no-images path does,
            # instead of returning an empty result it cannot interpret.
            logger.warning(f'Exception in ImageRestoration.call: {repr(e)}')
            return f'Error: image restoration failed: {repr(e)}'

# Model used by the restoration-capable agent. "qwen3-vl-flash" is faster and
# cheaper; "qwen-vl-ocr" targets typewritten texts. For a full list of
# available models, see https://www.alibabacloud.com/help/en/model-studio/models
model = "qwen3-vl-plus"

# Sampling options forwarded with every generation request are nested under
# "generate_cfg". "model_server" is the base_url (also known as api_base) of
# an OpenAI-compatible service; vLLM or Ollama endpoints can be used as well.
llm_cfg = {
    "model_type": "qwenvl_oai",
    "model": model,
    "model_server": "https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
    "api_key": api_key,
    "generate_cfg": {
        "top_p": 0.8,
        "top_k": 20,
        "temperature": 0.7,
        "repetition_penalty": 1.0,
        "presence_penalty": 1.5,
    },
}

# Instructions that make the model describe, research and review in a loop
# before it composes the final answer.
analysis_prompt = """Your role is that of a research assistant specializing in visual information. Answer questions about images by looking at them closely and then using research tools. Please follow this structured thinking process and show your work.

Start an iterative loop for each question:

- **First, look closely:** Begin with a detailed description of the image, paying attention to the user's question. List what you can tell just by looking, and what you'll need to look up.
- **Next, find information:** Use a tool to research the things you need to find out.
- **Then, review the findings:** Carefully analyze what the tool tells you and decide on your next action.

Continue this loop until your research is complete.

To finish, bring everything together in a clear, synthesized answer that fully responds to the user's question."""

# The custom `restore_image` tool registered above, plus two built-in tools.
tools = ["restore_image", "image_zoom_in_tool", "image_search"]

# The system message is optional; system_message='' enables the tools without
# the structured-analysis instructions.
agent = Assistant(
    system_message=analysis_prompt,
    function_list=tools,
    llm=llm_cfg,
)

# One user turn: the local image file plus the cataloguing request.
messages = [
    {"role": "user", "content": [
        {"image": 'savchenko.jpg'},
        {"text": "Create metadata for this image for an academic art museum."}
    ]}
]

# Stream the agent run, printing incrementally as messages arrive.
response_plain_text = ''
for ret_messages in agent.run(messages):
    # `ret_messages` will contain all subsequent messages, consisting of interleaved assistant messages and tool responses
    response_plain_text = multimodal_typewriter_print(ret_messages, response_plain_text)

# Visual Tool Calling