Scraping
If you want to extract data from websites, this is the part of the documentation you were looking for.
💡 Let's assume a pretty basic scenario. We want to make a request to a GitHub profile page and extract the nickname of the profile's owner, as well as the names of their pinned repositories and some other information that demonstrates the usage and power of skrape{it}'s DSL. 💪
Documentation by Example
All of the interesting parts are marked with a numbered marker (1️⃣, 2️⃣, …) and explained at the bottom of the code sample.
import it.skrape.core.htmlDocument
import it.skrape.selects.and
import it.skrape.selects.eachImage
import it.skrape.selects.eachText
import it.skrape.selects.html5.a
import it.skrape.selects.html5.div
import it.skrape.selects.html5.p
import it.skrape.selects.html5.span
import org.junit.jupiter.api.Test
/**
 * Mutable holder for the values scraped in this example.
 *
 * Every property has a default, so an instance can be created without
 * arguments and then filled in one property at a time.
 */
data class MyExtractedData(
    var httpMessage: String = "",
    var userName: String = "",
    var repositoryNames: List<String> = listOf(),
    var theThirdRepositoriesName: String = "",
    var firstThreeHrefs: List<String> = listOf(),
    var overviewLink: String = "",
    var firstThreeImageSources: List<String> = listOf(),
    var title: String = "",
    var starsCount: String = ""
)
/**
 * Requests the skrape{it} GitHub profile page, copies a handful of values
 * from the response into a [MyExtractedData] instance and prints that instance.
 *
 * The numbered markers (1️⃣, 2️⃣, …) tag the lines that are explained below the
 * code sample in the documentation.
 */
fun main() {
    val extracted = skrape { // 1️⃣
        url = "https://github.com/skrapeit"
        extractIt<MyExtractedData> { scraped ->
            scraped.httpMessage = status { message } // 2️⃣
            htmlDocument { // 3️⃣
                relaxed = true // 4️⃣
                scraped.userName = ".h-card .p-nickname" { findFirst { text } } // 5️⃣
                val repositories = span(".repo") { findAll { this } } // 6️⃣
                println("hello world") // 7️⃣
                scraped.repositoryNames = repositories.filter { repo -> repo.text.contains("skrape") }.eachText // 8️⃣
                scraped.theThirdRepositoriesName = span(".repo") {
                    2 { text } // 9️⃣
                }
                scraped.firstThreeImageSources = findAll { eachImage.map { image -> image.value } }.take(3) // 1️⃣0️⃣
                scraped.firstThreeHrefs = findAll { eachHref }.take(3) // 1️⃣1️⃣
                scraped.overviewLink = findAll { eachLink["Overview"] ?: "not found" } // 1️⃣2️⃣
                scraped.title = titleText // 1️⃣3️⃣
                // *️⃣
                scraped.starsCount = div { // 1️⃣5️⃣
                    withClass = "pinned-item-list-item"
                    findFirst {
                        p { // 1️⃣6️⃣
                            findSecond {
                                a {
                                    // 1️⃣7️⃣
                                    withClass = "pinned-item-meta" and "muted-link" // 1️⃣8️⃣
                                    withAttribute = "href" to "/skrapeit/skrape.it/stargazers" // 1️⃣9️⃣
                                    findFirst { ownText }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    println(extracted)
}
> hello world
> MyExtractedData(httpMessage=OK, userName=skrapeit, repositoryNames=[skrape.it, skrapeit-ktor-extension, skrapeit-mockmvc-extension, skrapeit-docs], theThirdRepositoriesName=skrapeit-mockmvc-extension, firstThreeHrefs=[https://github.githubassets.com, https://avatars0.githubusercontent.com, https://avatars1.githubusercontent.com], overviewLink=/skrapeit, firstThreeImageSources=[https://github.githubassets.com/images/spinners/octocat-spinner-128.gif, https://avatars0.githubusercontent.com/u/46688980?s=88&u=c99dfeadc23ab06f4c428ffd4330e95f0b32d2cb&v=4, https://avatars0.githubusercontent.com/u/46688980?s=460&u=c99dfeadc23ab06f4c428ffd4330e95f0b32d2cb&v=4], title=skrapeit · GitHub, starsCount=119)
Last updated
Was this helpful?