<?xml version="1.0" encoding="utf-8" standalone="no"?>
<dublin_core schema="dc">
<dcvalue element="contributor" qualifier="author">Choi,&#x20;Tae-Min</dcvalue>
<dcvalue element="contributor" qualifier="author">Yoon,&#x20;Inug</dcvalue>
<dcvalue element="contributor" qualifier="author">Kim,&#x20;Jong-Hwan</dcvalue>
<dcvalue element="contributor" qualifier="author">Park,&#x20;Ju&#x20;youn</dcvalue>
<dcvalue element="date" qualifier="accessioned">2024-12-12T08:30:06Z</dcvalue>
<dcvalue element="date" qualifier="available">2024-12-12T08:30:06Z</dcvalue>
<dcvalue element="date" qualifier="created">2024-12-11</dcvalue>
<dcvalue element="date" qualifier="issued">2024-11-25</dcvalue>
<dcvalue element="identifier" qualifier="uri">https:&#x2F;&#x2F;pubs.kist.re.kr&#x2F;handle&#x2F;201004&#x2F;151350</dcvalue>
<dcvalue element="identifier" qualifier="uri">https:&#x2F;&#x2F;bmvc2024.org&#x2F;proceedings&#x2F;85&#x2F;</dcvalue>
<dcvalue element="description" qualifier="abstract">Open-vocabulary&#x20;object&#x20;detection&#x20;(OVD)&#x20;is&#x20;a&#x20;computer&#x20;vision&#x20;task&#x20;that&#x20;detects&#x20;and&#x20;classifies&#x20;objects&#x20;from&#x20;categories&#x20;not&#x20;seen&#x20;during&#x20;training.&#x20;While&#x20;recent&#x20;OVD&#x20;methods&#x20;primarily&#x20;focus&#x20;on&#x20;aligning&#x20;region&#x20;embeddings&#x20;with&#x20;visual-language&#x20;pre-trained&#x20;models&#x20;like&#x20;CLIP&#x20;for&#x20;classification,&#x20;object&#x20;detection&#x20;requires&#x20;effective&#x20;localization&#x20;as&#x20;well.&#x20;However,&#x20;existing&#x20;methods&#x20;often&#x20;use&#x20;a&#x20;proposal&#x20;generator&#x20;biased&#x20;toward&#x20;the&#x20;training&#x20;data,&#x20;which&#x20;creates&#x20;a&#x20;bottleneck&#x20;in&#x20;performance&#x20;improvement.&#x20;To&#x20;address&#x20;this&#x20;challenge,&#x20;we&#x20;introduce&#x20;the&#x20;Textual&#x20;Attention&#x20;Region&#x20;Proposal&#x20;Network&#x20;(TA-RPN).&#x20;This&#x20;network&#x20;enhances&#x20;proposal&#x20;generation&#x20;by&#x20;integrating&#x20;visual&#x20;and&#x20;textual&#x20;features&#x20;from&#x20;the&#x20;CLIP&#x20;text&#x20;encoder,&#x20;utilizing&#x20;pixel-wise&#x20;attention&#x20;for&#x20;a&#x20;comprehensive&#x20;fusion&#x20;across&#x20;the&#x20;image&#x20;space.&#x20;Our&#x20;approach&#x20;also&#x20;incorporates&#x20;prompt&#x20;learning&#x20;to&#x20;optimize&#x20;textual&#x20;features&#x20;for&#x20;better&#x20;localization.&#x20;Evaluated&#x20;on&#x20;the&#x20;COCO&#x20;and&#x20;LVIS&#x20;benchmarks,&#x20;TA-RPN&#x20;outperforms&#x20;existing&#x20;state-of-the-art&#x20;methods,&#x20;demonstrating&#x20;its&#x20;effectiveness&#x20;in&#x20;detecting&#x20;novel&#x20;object&#x20;categories.</dcvalue>
<dcvalue element="language" qualifier="none">English</dcvalue>
<dcvalue element="publisher" qualifier="none">The&#x20;British&#x20;Machine&#x20;Vision&#x20;Association&#x20;and&#x20;Society&#x20;for&#x20;Pattern&#x20;Recognition</dcvalue>
<dcvalue element="title" qualifier="none">Textual&#x20;Attention&#x20;RPN&#x20;for&#x20;Open-Vocabulary&#x20;Object&#x20;Detection</dcvalue>
<dcvalue element="type" qualifier="none">Conference</dcvalue>
<dcvalue element="description" qualifier="journalClass">1</dcvalue>
<dcvalue element="identifier" qualifier="bibliographicCitation">The&#x20;35th&#x20;British&#x20;Machine&#x20;Vision&#x20;Conference</dcvalue>
<dcvalue element="citation" qualifier="title">The&#x20;35th&#x20;British&#x20;Machine&#x20;Vision&#x20;Conference</dcvalue>
<dcvalue element="citation" qualifier="conferencePlace">UK</dcvalue>
<dcvalue element="citation" qualifier="conferencePlace">Glasgow,&#x20;UK</dcvalue>
<dcvalue element="citation" qualifier="conferenceDate">2024-11-25</dcvalue>
<dcvalue element="relation" qualifier="isPartOf">The&#x20;35th&#x20;British&#x20;Machine&#x20;Vision&#x20;Conference</dcvalue>
</dublin_core>
