diff --git a/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt b/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt index 7177d41..0c3af80 100644 --- a/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt +++ b/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt @@ -243,6 +243,10 @@ class ScreenOperatorAccessibilityService : AccessibilityService() { true // Asynchronous } is Command.TakeScreenshot -> executeTakeScreenshotCommand() + is Command.Completed -> { + Log.d(TAG, "Command.Completed: No accessibility action required.") + true + } is Command.Wait -> { pendingScreenshotDelayMillis = command.seconds .coerceAtLeast(0L) @@ -420,7 +424,7 @@ class ScreenOperatorAccessibilityService : AccessibilityService() { } } }.also { - if (command !is Command.TakeScreenshot && command !is Command.TermuxCommand) { + if (command !is Command.TakeScreenshot && command !is Command.TermuxCommand && command !is Command.Completed) { sawNonTermuxCommandSinceLastScreenshot = true } } diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningCommandProcessing.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningCommandProcessing.kt index 1ae5fd2..9f3685b 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningCommandProcessing.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningCommandProcessing.kt @@ -6,6 +6,7 @@ import com.google.ai.sample.util.CommandParser internal data class ParsedCommandBatch( val commands: List, val hasTakeScreenshotCommand: Boolean, + val hasCompletedCommand: Boolean, val commandDescriptions: String ) @@ -19,6 +20,7 @@ internal object PhotoReasoningCommandProcessing { return ParsedCommandBatch( commands = commands, hasTakeScreenshotCommand = commands.any { it is Command.TakeScreenshot }, + hasCompletedCommand = commands.any { it is Command.Completed }, commandDescriptions = commands.joinToString("; ") { it.toString() } ) } diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt index 92864c0..211e4fa 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt @@ -392,6 +392,7 @@ fun PhotoReasoningScreen( is Command.ClickButton -> "Click on button: \"${command.buttonText}\"" is Command.TapCoordinates -> "Tap coordinates: (${command.x}, ${command.y})" is Command.TakeScreenshot -> "Take screenshot" + is Command.Completed -> "Completed" is Command.Wait -> "Wait: ${command.seconds} seconds" is Command.Retrieve -> "Retrieve: \"${command.heading}\"" else -> command::class.simpleName ?: "Unknown Command" diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt index c3b4b05..c0be47d 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt @@ -2235,12 +2235,17 @@ class PhotoReasoningViewModel( for (command in newCommands) { if (stopExecutionFlag.get()) break - // Skip takeScreenshot during streaming - it will be handled by final processCommands + // Skip commands that are handled only after streaming has finished. if (command is Command.TakeScreenshot) { Log.d(TAG, "Incremental: Skipping takeScreenshot during streaming (will be handled at end)") incrementalCommandCount++ continue } + if (command is Command.Completed) { + Log.d(TAG, "Incremental: completed() received; stopping incremental command execution") + incrementalCommandCount++ + break + } try { Log.d(TAG, "Incremental: Executing command: $command") @@ -2267,17 +2272,29 @@ private fun processCommands(text: String) { if (PhotoReasoningCommandExecutionGuard.shouldAbort(commandProcessingJob?.isActive == true, stopExecutionFlag.get())) return@launch // Check for cancellation try { val commandBatch = PhotoReasoningCommandProcessing.parseForFinalExecution(text) - val commands = commandBatch.commands + val parsedCommands = commandBatch.commands + val hasCompletedCommand = commandBatch.hasCompletedCommand val hasTakeScreenshotCommand = commandBatch.hasTakeScreenshotCommand + val commandsBeforeCompletion = if (hasCompletedCommand) { + parsedCommands.takeWhile { it !is Command.Completed } + Command.Completed + } else { + parsedCommands + } + val commands = if (hasCompletedCommand || hasTakeScreenshotCommand) { + commandsBeforeCompletion + } else { + commandsBeforeCompletion + Command.TakeScreenshot + } val commandsToExecute = commands.mapIndexedNotNull { index, command -> when { + command is Command.Completed -> null command is Command.Retrieve -> null index < incrementalCommandCount && command !is Command.TakeScreenshot -> null else -> command } } - if (hasTakeScreenshotCommand) { + if (!hasCompletedCommand) { pendingRetrievedInfoForNextScreenshot = buildRetrievedInfoForNextScreenshot(commands) } @@ -2293,7 +2310,7 @@ private fun processCommands(text: String) { commands = commands ) _commandExecutionStatus.value = PhotoReasoningCommandStateUpdater.buildDetectedStatus( - commandBatch.commandDescriptions + commands.joinToString("; ") { it.toString() } ) } @@ -2324,12 +2341,8 @@ private fun processCommands(text: String) { } } - // Toast anzeigen wenn kein takeScreenshot Command gefunden wurde - if (!hasTakeScreenshotCommand && !text.contains("takeScreenshot()", ignoreCase = true)) { - val context = MainActivity.getInstance() - if (context != null) { - PhotoReasoningCommandUiNotifier.showStoppedByAi(context) - } + if (hasCompletedCommand) { + _commandExecutionStatus.value = "Task marked completed by AI." } } catch (e: Exception) { diff --git a/app/src/main/kotlin/com/google/ai/sample/util/Command.kt b/app/src/main/kotlin/com/google/ai/sample/util/Command.kt index f82628f..fdf2717 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/Command.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/Command.kt @@ -8,6 +8,7 @@ sealed class Command { data class LongClickButton(val buttonText: String) : Command() data class TapCoordinates(val x: String, val y: String) : Command() object TakeScreenshot : Command() + object Completed : Command() data class Wait(val seconds: Long) : Command() object PressHomeButton : Command() object PressBackButton : Command() diff --git a/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt b/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt index 62c920c..79cc0b8 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt @@ -7,11 +7,14 @@ import android.util.Log */ object CommandParser { private const val TAG = "CommandParser" - private val SINGLE_INSTANCE_COMMAND_TYPES = setOf(CommandTypeEnum.TAKE_SCREENSHOT) + private val SINGLE_INSTANCE_COMMAND_TYPES = setOf( + CommandTypeEnum.TAKE_SCREENSHOT, + CommandTypeEnum.COMPLETED + ) // Enum to represent different command types private enum class CommandTypeEnum { - CLICK_BUTTON, LONG_CLICK_BUTTON, TAP_COORDINATES, TAKE_SCREENSHOT, WAIT, PRESS_HOME, PRESS_BACK, + CLICK_BUTTON, LONG_CLICK_BUTTON, TAP_COORDINATES, TAKE_SCREENSHOT, COMPLETED, WAIT, PRESS_HOME, PRESS_BACK, SHOW_RECENT_APPS, SCROLL_DOWN, SCROLL_UP, SCROLL_LEFT, SCROLL_RIGHT, SCROLL_DOWN_FROM_COORDINATES, SCROLL_UP_FROM_COORDINATES, SCROLL_LEFT_FROM_COORDINATES, SCROLL_RIGHT_FROM_COORDINATES, @@ -53,8 +56,9 @@ object CommandParser { // Tap coordinates patterns PatternInfo("tapCoords1", Regex("(?i)\\btapAtCoordinates\\(\\s*([\\d\\.%]+)\\s*,\\s*([\\d\\.%]+)\\s*\\)"), { match -> Command.TapCoordinates(match.groupValues[1], match.groupValues[2]) }, CommandTypeEnum.TAP_COORDINATES), - // Screenshot and wait patterns + // Screenshot, completion and wait patterns PatternInfo("screenshot1", Regex("(?i)\\btakeScreenshot\\(\\)"), { Command.TakeScreenshot }, CommandTypeEnum.TAKE_SCREENSHOT), + PatternInfo("completed1", Regex("(?i)\\bcompleted\\(\\)"), { Command.Completed }, CommandTypeEnum.COMPLETED), PatternInfo("wait1", Regex("(?i)\\bWait\\(\\s*(\\d+)\\s*\\)"), { match -> Command.Wait(match.groupValues[1].toLong()) }, CommandTypeEnum.WAIT), // Home button patterns @@ -151,6 +155,7 @@ object CommandParser { is Command.LongClickButton -> Log.d(TAG, "Command details: LongClickButton(\"${command.buttonText}\")") is Command.TapCoordinates -> Log.d(TAG, "Command details: TapCoordinates(${command.x}, ${command.y})") is Command.TakeScreenshot -> Log.d(TAG, "Command details: TakeScreenshot") + is Command.Completed -> Log.d(TAG, "Command details: Completed") is Command.Wait -> Log.d(TAG, "Command details: Wait(${command.seconds})") is Command.PressHomeButton -> Log.d(TAG, "Command details: PressHomeButton") is Command.PressBackButton -> Log.d(TAG, "Command details: PressBackButton") diff --git a/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt b/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt index 4ad868c..f3e2df9 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt @@ -14,7 +14,7 @@ object SystemMessagePreferences { private const val KEY_FIRST_START_COMPLETED = "first_start_completed" // New flag // Content from pasted_content.txt - private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "Wait(seconds)" "Termux("command")" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. If you have questions, open Screen Operator, ask your question(s), and do not use takeScreenshot() until you receive an answer. Retrieve information using "retrieve("sample")" if some is passed to your task. You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()"""" + private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "Wait(seconds)" "Termux("command")" "completed()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. If you have questions, open Screen Operator, ask your question(s), and use completed() until you receive an answer. Retrieve information using "retrieve("sample")" if some is passed to your task. After each message, you will see the screen with additional information about it. Say "completed()" when the task is finished.""" private fun prefs(context: Context) = context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE) /** diff --git a/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt b/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt index 361b894..4b26399 100644 --- a/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt +++ b/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt @@ -78,4 +78,23 @@ class CommandParserTest { assertEquals(7L, (wait as Command.Wait).seconds) assertTrue(commands[1] is Command.TakeScreenshot) } + + @Test + fun parseCommands_extractsCompletedCommand() { + val commands = CommandParser.parseCommands("completed()", clearBuffer = true) + + assertEquals(1, commands.size) + assertTrue(commands.first() is Command.Completed) + } + + @Test + fun parseCommands_keepsSingleCompletedCommandInstance() { + val commands = CommandParser.parseCommands( + "completed() completed()", + clearBuffer = true + ) + + assertEquals(1, commands.count { it is Command.Completed }) + } + }