from typing import Any, Dict, List

from langchain_core.messages import AnyMessage, AIMessage, HumanMessage

def get_research_topic(messages: List[AnyMessage]) -> str:
    """
    Combine a message history into a single research-topic string.

    A single message is returned as-is; a longer history is flattened into
    "User:" / "Assistant:" turns, one per line.
    """
    # Check if the request has a history and combine the messages into a single string.
    if len(messages) == 1:
        research_topic = messages[-1].content
    else:
        research_topic = ""
        for message in messages:
            if isinstance(message, HumanMessage):
                research_topic += f"User: {message.content}\n"
            elif isinstance(message, AIMessage):
                research_topic += f"Assistant: {message.content}\n"
    return research_topic
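
# A minimal usage sketch (the message contents below are made up, purely
# for illustration):
#
#     history = [
#         HumanMessage(content="Tell me about solid-state batteries."),
#         AIMessage(content="Solid-state batteries replace the liquid electrolyte..."),
#         HumanMessage(content="How close are they to mass production?"),
#     ]
#     get_research_topic(history)
#     # -> "User: Tell me about solid-state batteries.\n"
#     #    "Assistant: Solid-state batteries replace the liquid electrolyte...\n"
#     #    "User: How close are they to mass production?\n"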

def resolve_urls(urls_to_resolve: List[Any], id: int) -> Dict[str, str]:
    """
    Map each (very long) Vertex AI Search URL to a short URL with a unique id.

    Ensures each original URL gets a consistent shortened form while
    maintaining uniqueness.
    """
    prefix = "https://vertexaisearch.cloud.google.com/id/"
    urls = [site.web.uri for site in urls_to_resolve]

    # Map each unique URL to the index of its first occurrence.
    resolved_map = {}
    for idx, url in enumerate(urls):
        if url not in resolved_map:
            resolved_map[url] = f"{prefix}{id}-{idx}"

    return resolved_map
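
# A minimal usage sketch. The real inputs are Gemini grounding chunks; here
# types.SimpleNamespace stands in for objects exposing `.web.uri`:
#
#     from types import SimpleNamespace as NS
#
#     chunks = [
#         NS(web=NS(uri="https://example.com/a")),
#         NS(web=NS(uri="https://example.com/b")),
#         NS(web=NS(uri="https://example.com/a")),  # duplicate, keeps index 0
#     ]
#     resolve_urls(chunks, id=7)
#     # -> {"https://example.com/a": "https://vertexaisearch.cloud.google.com/id/7-0",
#     #     "https://example.com/b": "https://vertexaisearch.cloud.google.com/id/7-1"}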

def insert_citation_markers(text: str, citations_list: List[Dict[str, Any]]) -> str:
    """
    Insert citation markers into a text string based on start and end indices.

    Args:
        text (str): The original text string.
        citations_list (list): A list of dictionaries, where each dictionary
                               contains 'start_index', 'end_index', and
                               'segments' (the link data used to build the
                               marker). Indices refer to the original text.

    Returns:
        str: The text with citation markers inserted.
    """
    # Sort citations by end_index in descending order (ties broken by
    # start_index, also descending). Inserting from the end of the string
    # ensures that insertions don't shift the indices of earlier parts of
    # the string that still need to be processed.
    sorted_citations = sorted(
        citations_list, key=lambda c: (c["end_index"], c["start_index"]), reverse=True
    )

    modified_text = text
    for citation_info in sorted_citations:
        # These indices refer to positions in the *original* text, but since
        # we iterate from the end, they remain valid insertion points.
        end_idx = citation_info["end_index"]
        marker_to_insert = ""
        for segment in citation_info["segments"]:
            marker_to_insert += f" [{segment['label']}]({segment['short_url']})"
        # Insert the citation marker at the original end_idx position.
        modified_text = (
            modified_text[:end_idx] + marker_to_insert + modified_text[end_idx:]
        )

    return modified_text
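
# A minimal usage sketch (indices and URLs are made up for illustration;
# end_index 34 points just past the first sentence's period):
#
#     text = "Solar capacity grew 24% last year. Wind grew 11%."
#     citations = [
#         {
#             "start_index": 0,
#             "end_index": 34,
#             "segments": [
#                 {"label": "iea",
#                  "short_url": "https://vertexaisearch.cloud.google.com/id/0-0"}
#             ],
#         }
#     ]
#     insert_citation_markers(text, citations)
#     # -> "Solar capacity grew 24% last year.
#     #     [iea](https://vertexaisearch.cloud.google.com/id/0-0) Wind grew 11%."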

def get_citations(response: Any, resolved_urls_map: Dict[str, str]) -> List[Dict[str, Any]]:
    """
    Extract and format citation information from a Gemini model's response.

    This function processes the grounding metadata provided in the response to
    construct a list of citation objects. Each citation object includes the
    start and end indices of the text segment it refers to, plus the link data
    for the web chunks that support it.

    Args:
        response: The response object from the Gemini model, expected to have
                  a structure including `candidates[0].grounding_metadata`.
        resolved_urls_map: A mapping from chunk URIs to their resolved
                           (shortened) URLs, as produced by `resolve_urls`.

    Returns:
        list: A list of dictionaries, where each dictionary represents a citation
              and has the following keys:
              - "start_index" (int): The starting character index of the cited
                                     segment in the original text. Defaults to 0
                                     if not specified.
              - "end_index" (int): The character index immediately after the
                                   end of the cited segment (exclusive).
              - "segments" (list[dict]): One dict per grounding chunk, with
                                         "label", "short_url", and "value"
                                         (the original URI) keys.
              Returns an empty list if no valid candidates or grounding supports
              are found, or if essential data is missing.
    """
    citations = []

    # Ensure response and necessary nested structures are present.
    if not response or not response.candidates:
        return citations

    candidate = response.candidates[0]
    if (
        not hasattr(candidate, "grounding_metadata")
        or not candidate.grounding_metadata
        or not hasattr(candidate.grounding_metadata, "grounding_supports")
    ):
        return citations

    for support in candidate.grounding_metadata.grounding_supports:
        citation = {}

        # Ensure segment information is present.
        if not hasattr(support, "segment") or support.segment is None:
            continue  # Skip this support if segment info is missing.

        start_index = (
            support.segment.start_index
            if support.segment.start_index is not None
            else 0
        )

        # Ensure end_index is present to form a valid segment.
        if support.segment.end_index is None:
            continue  # Skip if end_index is missing, as it's crucial.

        # end_index is treated as exclusive (matching the docstring above),
        # so it can be used directly for slicing and marker insertion.
        citation["start_index"] = start_index
        citation["end_index"] = support.segment.end_index

        citation["segments"] = []
        if (
            hasattr(support, "grounding_chunk_indices")
            and support.grounding_chunk_indices
        ):
            for ind in support.grounding_chunk_indices:
                try:
                    chunk = candidate.grounding_metadata.grounding_chunks[ind]
                    resolved_url = resolved_urls_map.get(chunk.web.uri, None)
                    citation["segments"].append(
                        {
                            # Drop the trailing domain suffix from the title,
                            # e.g. "example.com" -> "example". Titles without
                            # a dot raise IndexError and are skipped below.
                            "label": chunk.web.title.split(".")[:-1][0],
                            "short_url": resolved_url,
                            "value": chunk.web.uri,
                        }
                    )
                except (IndexError, AttributeError):
                    # The chunk index, web attribute, uri, or title may be
                    # missing or malformed; skip this particular segment link.
                    # In a production system, you might want to log this.
                    pass
        citations.append(citation)
    return citations
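
# A minimal usage sketch. SimpleNamespace only mimics the shape of a real
# Gemini response with grounding metadata; the field values are invented:
#
#     from types import SimpleNamespace as NS
#
#     chunk = NS(web=NS(uri="https://example.com/report", title="example.com"))
#     support = NS(
#         segment=NS(start_index=0, end_index=34),
#         grounding_chunk_indices=[0],
#     )
#     response = NS(
#         candidates=[
#             NS(grounding_metadata=NS(grounding_supports=[support],
#                                      grounding_chunks=[chunk]))
#         ]
#     )
#     resolved = {"https://example.com/report":
#                 "https://vertexaisearch.cloud.google.com/id/0-0"}
#     get_citations(response, resolved)
#     # -> [{"start_index": 0, "end_index": 34,
#     #      "segments": [{"label": "example",
#     #                    "short_url": "https://vertexaisearch.cloud.google.com/id/0-0",
#     #                    "value": "https://example.com/report"}]}]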