Skip to content

Commit

Permalink
refactor(ai): Update extractTextWithAi to use Maestro Cloud endpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
luistak committed Jan 29, 2025
1 parent a6d0e57 commit 79307cb
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 46 deletions.
45 changes: 6 additions & 39 deletions maestro-ai/src/main/java/maestro/ai/Prediction.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package maestro.ai
import kotlinx.serialization.Serializable
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.jsonObject
import maestro.ai.cloud.ApiClient
import maestro.ai.openai.OpenAI

@Serializable
Expand Down Expand Up @@ -227,47 +228,13 @@ object Prediction {
}

suspend fun extractText(
aiClient: AI,
screen: ByteArray,
apiKey: String,
query: String,
screen: ByteArray,
): String {
val prompt = buildString {
append("What text on the screen matches the following query: $query")

append(
"""
|
|RULES:
|* Provide response as a valid JSON, with structure described below.
""".trimMargin("|")
)
val client = ApiClient()
val response = client.extractTextWithAi(apiKey, query, screen)

append(
"""
|
|* You must provide result as a valid JSON object, matching this structure:
|
| {
| "text": <string>
| }
|
|DO NOT output any other information in the JSON object.
""".trimMargin("|")
)
}

val aiResponse = aiClient.chatCompletion(
prompt,
model = aiClient.defaultModel,
maxTokens = 4096,
identifier = "perform-assertion",
imageDetail = "high",
images = listOf(screen),
jsonSchema = if (aiClient is OpenAI) json.parseToJsonElement(extractTextSchema).jsonObject else null,
)

val response = json.decodeFromString<ExtractTextResponse>(aiResponse.response)
return response.text ?: ""
return response.text
}

}
86 changes: 86 additions & 0 deletions maestro-ai/src/main/java/maestro/ai/cloud/ApiClient.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package maestro.ai.cloud

import io.ktor.client.*
import io.ktor.client.plugins.*
import io.ktor.client.plugins.contentnegotiation.*
import io.ktor.client.request.*
import io.ktor.client.statement.*
import io.ktor.http.*
import kotlinx.serialization.Serializable
import kotlinx.serialization.SerializationException
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json
import maestro.ai.openai.OpenAI
import org.slf4j.LoggerFactory

// Named after ApiClient (not OpenAI) so that log output from this file is attributed to the
// Maestro Cloud client that actually produced it.
private val logger = LoggerFactory.getLogger(ApiClient::class.java)

/**
 * Request payload for the Maestro Cloud extract-text endpoint.
 *
 * @property query description of the text to look for on the screen.
 * @property screen raw bytes of the screenshot to inspect.
 */
@Serializable
data class ExtractTextWithAiRequest(
    val query: String,
    val screen: ByteArray,
) {
    // The generated data-class equals/hashCode compare arrays by reference; compare the
    // screenshot by content instead so that identical requests are actually equal.
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is ExtractTextWithAiRequest) return false
        return query == other.query && screen.contentEquals(other.screen)
    }

    override fun hashCode(): Int = 31 * query.hashCode() + screen.contentHashCode()
}

/**
 * Response payload returned by the Maestro Cloud extract-text endpoint.
 *
 * @property text the text reported by the service for the submitted query.
 */
@Serializable
data class ExtractTextWithAiResponse(val text: String)

/**
 * Small HTTP client for the Maestro Cloud AI endpoints.
 *
 * The base URL comes from the `MAESTRO_CLOUD_API_URL` environment variable and falls back to
 * `https://api.copilot.mobile.dev`. Request and response bodies are encoded and decoded
 * explicitly with [json], so no content-negotiation converter is installed on [httpClient].
 *
 * Each instance owns its [httpClient]; call [close] once the instance is no longer needed so
 * the underlying HTTP engine resources are released.
 */
class ApiClient : AutoCloseable {
    private val baseUrl by lazy {
        // Treat a blank override as unset and drop trailing slashes so "$baseUrl/v2/..." never
        // produces a double slash.
        (System.getenv("MAESTRO_CLOUD_API_URL")?.takeIf { it.isNotBlank() } ?: "https://api.copilot.mobile.dev")
            .trimEnd('/')
    }

    private val json = Json { ignoreUnknownKeys = true }

    val httpClient = HttpClient {
        // No ContentNegotiation plugin: the previous configuration only built and discarded a
        // Json instance, so no converter was ever registered. Bodies are handled via [json].
        install(HttpTimeout) {
            connectTimeoutMillis = 10000
            socketTimeoutMillis = 60000
            requestTimeoutMillis = 60000
        }
    }

    /**
     * Asks Maestro Cloud which text on [screen] matches [query].
     *
     * @param apiKey Maestro Cloud API key, sent as a bearer token.
     * @param query description of the text to look for.
     * @param screen raw bytes of the screenshot to inspect.
     * @return the decoded service response.
     * @throws SerializationException if the response body cannot be decoded.
     * @throws Exception if the request fails or the service answers with a non-success status.
     */
    suspend fun extractTextWithAi(
        apiKey: String,
        query: String,
        screen: ByteArray,
    ): ExtractTextWithAiResponse {
        val url = "$baseUrl/v2/extract-text"

        logger.debug("Sending extract-text request to Maestro Cloud: {}", url)

        return try {
            val httpResponse = httpClient.post(url) {
                headers {
                    append(HttpHeaders.Authorization, "Bearer $apiKey")
                    append(HttpHeaders.ContentType, ContentType.Application.Json.toString()) // Explicitly set JSON content type
                }
                setBody(json.encodeToString(ExtractTextWithAiRequest(query, screen)))
            }

            val body = httpResponse.bodyAsText()
            if (!httpResponse.status.isSuccess()) {
                // Logged once by the generic handler below; logging here as well duplicated the entry.
                throw Exception("Failed to complete request to Maestro Cloud URL: $url: ${httpResponse.status}, $body")
            }

            json.decodeFromString<ExtractTextWithAiResponse>(body)
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is normal coroutine control flow, not a request failure.
            throw e
        } catch (e: SerializationException) {
            logger.error("Failed to parse response from Maestro Cloud", e)
            throw e
        } catch (e: Exception) {
            logger.error("Failed to complete request to Maestro Cloud", e)
            throw e
        }
    }

    /** Releases the resources held by [httpClient]. */
    override fun close() {
        httpClient.close()
    }
}
2 changes: 2 additions & 0 deletions maestro-client/src/main/java/maestro/Errors.kt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ sealed class MaestroException(override val message: String) : RuntimeException(m

/** Signals that an AI-backed command was run while no AI client was available. */
class AINotAvailable(message: String) : MaestroException(message)

/** Signals that a Maestro Cloud call was attempted while `MAESTRO_CLOUD_API_KEY` was missing or empty. */
class CloudApiKeyNotAvailable(message: String) : MaestroException(message)

// NOTE(review): presumably raised when an output destination cannot be written — confirm at the throw site.
class DestinationIsNotWritable(message: String) : MaestroException(message)

// NOTE(review): presumably raised when copying text from a UI element fails — confirm at the throw site.
class UnableToCopyTextFromElement(message: String): MaestroException(message)
Expand Down
21 changes: 14 additions & 7 deletions maestro-orchestra/src/main/java/maestro/orchestra/Orchestra.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,15 @@
package maestro.orchestra

import kotlinx.coroutines.runBlocking
import maestro.*
import maestro.Driver
import maestro.ElementFilter
import maestro.Filters
import maestro.Filters.asFilter
import maestro.FindElementResult
import maestro.Maestro
import maestro.MaestroException
import maestro.ScreenRecording
import maestro.ViewHierarchy
import maestro.ai.AI
import maestro.ai.AI.Companion.AI_KEY_ENV_VAR
import maestro.ai.Defect
Expand All @@ -37,6 +44,7 @@ import maestro.orchestra.filter.TraitFilters
import maestro.orchestra.geo.Traveller
import maestro.orchestra.util.Env.evaluateScripts
import maestro.orchestra.yaml.YamlCommandReader
import maestro.toSwipeDirection
import maestro.utils.Insight
import maestro.utils.Insights
import maestro.utils.MaestroTimer
Expand Down Expand Up @@ -415,18 +423,17 @@ class Orchestra(
}

private fun extractTextWithAICommand(command: ExtractTextWithAICommand): Boolean = runBlocking {
// Extract text from the screen using AI
if (ai == null) {
throw MaestroException.AINotAvailable("AI client is not available. Did you export $AI_KEY_ENV_VAR?")
val apiKey = System.getenv("MAESTRO_CLOUD_API_KEY")
if (apiKey.isNullOrEmpty()) {
throw MaestroException.CloudApiKeyNotAvailable("`MAESTRO_CLOUD_API_KEY` is not available. Did you export MAESTRO_CLOUD_API_KEY?")
}

val imageData = Buffer()
maestro.takeScreenshot(imageData, compressed = false)

val text = Prediction.extractText(
aiClient = ai,
screen = imageData.copy().readByteArray(),
apiKey = apiKey,
query = command.query,
screen = imageData.copy().readByteArray(),
)

jsEngine.putEnv(command.outputVariable, text)
Expand Down

0 comments on commit 79307cb

Please sign in to comment.