Dies ist eine alte Version des Dokuments!
Siehe auch ElasticSearch (z. B. Skripte, API etc. sind identisch). Wazuh verwendet OpenSearch.
Run a local cluster
# Start a throwaway single-node OpenSearch 2.6.0 container.
# Port 9200 = REST API, 9600 = performance analyzer; --rm discards the
# container (and its data) on exit, so this is for experimentation only.
docker run --rm -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:2.6.0
Create a python script
from opensearchpy import OpenSearch

# Connect to the local single-node cluster started above.
# The demo setup uses the default admin credentials and a self-signed
# certificate, so certificate verification is disabled.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
client.info()
Get some sample data, e.g. the wikipedia-movie-plots dataset. Read the data into a pandas DataFrame.
import pandas as pd

# Load the movie-plots CSV, drop incomplete rows, then keep a reproducible
# 5000-row sample with a fresh 0..4999 index (used as document ids later).
movies = pd.read_csv("wiki_movie_plots_deduped.csv")
movies = movies.dropna()
df = movies.sample(5000, random_state=42).reset_index(drop=True)
Create an index
# Index mapping: the English analyzer (stemming, stop words) for free-text
# search fields, the standard analyzer for names, and exact types otherwise.
def _text(analyzer):
    # Shorthand for a full-text field with the given analyzer.
    return {"type": "text", "analyzer": analyzer}

properties = {
    "title": _text("english"),
    "ethnicity": _text("standard"),
    "director": _text("standard"),
    "cast": _text("standard"),
    "genre": _text("standard"),
    "plot": _text("english"),
    "year": {"type": "integer"},
    "wiki_page": {"type": "keyword"},
}
body = {"mappings": {"properties": properties}}
response = client.indices.create("movies", body=body)
Push the data into the index
# Index field name -> DataFrame column name.
_FIELD_COLUMNS = {
    "title": "Title",
    "ethnicity": "Origin/Ethnicity",
    "director": "Director",
    "cast": "Cast",
    "genre": "Genre",
    "plot": "Plot",
    "year": "Release Year",
    "wiki_page": "Wiki Page",
}

# One HTTP request per document; the row's DataFrame index doubles as
# the document id.
for doc_id, row in df.iterrows():
    document = {field: row[column] for field, column in _FIELD_COLUMNS.items()}
    client.index(index="movies", id=doc_id, body=document)
Insert more data using a single bulk request
from opensearchpy.helpers import bulk

# Build all index actions up front, then ship them in one bulk call —
# far fewer round-trips than indexing row by row.
actions = []
for doc_id, row in df.iterrows():
    source = {
        "title": row["Title"],
        "ethnicity": row["Origin/Ethnicity"],
        "director": row["Director"],
        "cast": row["Cast"],
        "genre": row["Genre"],
        "plot": row["Plot"],
        "year": row["Release Year"],
        "wiki_page": row["Wiki Page"],
    }
    actions.append({"_index": "movies", "_id": doc_id, "_source": source})
bulk(client, actions)
Count the inserted data
# Refresh so freshly indexed documents become searchable, then count them.
target_index = "movies"
client.indices.refresh(index=target_index)
client.cat.count(index=target_index, format="json")
Search the data
# Movies with Jack Nicholson in the cast, excluding anything directed by
# Tim Burton (the exclusion runs as a filter, so it does not affect scoring).
cast_clause = {"match_phrase": {"cast": "jack nicholson"}}
director_exclusion = {
    "bool": {"must_not": {"match_phrase": {"director": "tim burton"}}}
}
query = {"bool": {"must": cast_clause, "filter": director_exclusion}}

resp = client.search(index="movies", body={"query": query})
resp
Remove documents
# Delete a single document from the "movies" index by its document id.
client.delete(index="movies", id="2500")
Delete the index
# Drop the entire "movies" index (mapping and all documents).
client.indices.delete(index='movies')
Run as root
#!/usr/bin/env bash
#
# setup.sh — provisions a self-contained "at-search" stack on a Debian/Ubuntu
# host: a single-node OpenSearch (via docker-compose), Apache Nutch as the
# crawler, and helper scripts for crawling and querying.
# Must be run as root (installs packages, writes to /opt and /usr/local/bin).
set -Eeuo pipefail
trap 'echo; echo "[FEHLER] setup.sh abgebrochen in Zeile $LINENO"; exit 1' ERR

if [[ "${EUID}" -ne 0 ]]; then
  echo "Bitte mit sudo/root ausführen."
  exit 1
fi

# ---- Configuration ----------------------------------------------------------
APP_ROOT="/opt/at-search"
NUTCH_VERSION="1.22"
# OpenSearch 1.x line matches Nutch's indexer-opensearch1x plugin.
OPENSEARCH_VERSION="1.3.20"
JAVA_PKG="openjdk-11-jre-headless"
# uid/gid the opensearch container image runs as (data volume ownership).
OS_UID="1000"
OS_GID="1000"

echo "[1/13] Systempakete installieren"
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y \
  curl \
  wget \
  jq \
  tar \
  gzip \
  unzip \
  ca-certificates \
  gnupg \
  lsb-release \
  software-properties-common \
  docker.io \
  docker-compose \
  "${JAVA_PKG}"
systemctl enable docker
systemctl restart docker

echo "[2/13] Host-Kernel-Setting für OpenSearch setzen"
# OpenSearch needs vm.max_map_count >= 262144; persist it and apply it now.
if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then
  sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf
else
  echo 'vm.max_map_count=262144' >> /etc/sysctl.conf
fi
sysctl -w vm.max_map_count=262144 >/dev/null

echo "[3/13] Verzeichnisstruktur anlegen"
mkdir -p \
  "${APP_ROOT}/"{opensearch,seeds,crawl,logs,bin,downloads} \
  "${APP_ROOT}/opensearch/data"

echo "[4/13] Rechte für OpenSearch-Datenverzeichnis korrekt setzen"
# The container writes to the mounted data volume as uid/gid 1000.
chown -R "${OS_UID}:${OS_GID}" "${APP_ROOT}/opensearch/data"
chmod 775 "${APP_ROOT}/opensearch/data"

echo "[5/13] Docker-Compose für OpenSearch schreiben"
# Unquoted EOF: ${OPENSEARCH_VERSION} and ${APP_ROOT} are expanded on purpose.
cat > "${APP_ROOT}/opensearch/docker-compose.yml" <<EOF
version: "3.8"
services:
  opensearch:
    image: opensearchproject/opensearch:${OPENSEARCH_VERSION}
    container_name: at-opensearch
    restart: unless-stopped
    environment:
      - cluster.name=at-search-cluster
      - node.name=at-opensearch
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g
      - DISABLE_INSTALL_DEMO_CONFIG=true
      - DISABLE_SECURITY_PLUGIN=true
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - ${APP_ROOT}/opensearch/data:/usr/share/opensearch/data
    ports:
      - "127.0.0.1:9200:9200"
      - "127.0.0.1:9600:9600"
EOF

echo "[6/13] OpenSearch starten"
cd "${APP_ROOT}/opensearch"
docker-compose up -d

echo "Warte auf OpenSearch ..."
# Poll for up to 4 minutes (120 attempts * 2 s) until the REST API answers.
ok=0
for _ in {1..120}; do
  if curl -fsS "http://127.0.0.1:9200" >/dev/null 2>&1; then
    ok=1
    break
  fi
  sleep 2
done
if [[ "${ok}" != "1" ]]; then
  echo
  echo "[FEHLER] OpenSearch kam nicht hoch. Letzte Logs:"
  docker-compose logs --tail=200 opensearch || true
  exit 1
fi

echo "[7/13] Basis-Index anlegen"
# "|| true": re-running the script must not abort when the index exists.
curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \
  -H 'Content-Type: application/json' \
  -d '{
    "settings": { "number_of_shards": 1, "number_of_replicas": 0 },
    "mappings": {
      "properties": {
        "url":     { "type": "keyword" },
        "host":    { "type": "keyword" },
        "site":    { "type": "keyword" },
        "title":   { "type": "text" },
        "content": { "type": "text" },
        "type":    { "type": "keyword" },
        "digest":  { "type": "keyword" },
        "segment": { "type": "keyword" },
        "tstamp":  { "type": "date", "format": "strict_date_optional_time||epoch_millis" }
      }
    }
  }' >/dev/null || true

echo "[8/13] Apache Nutch herunterladen"
cd "${APP_ROOT}/downloads"
if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then
  wget -O "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" \
    "https://dlcdn.apache.org/nutch/${NUTCH_VERSION}/apache-nutch-${NUTCH_VERSION}-bin.tar.gz"
fi
rm -rf "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}"
tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}"
# Stable path "nutch" -> versioned directory; -n replaces an existing link.
ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch"
chmod +x "${APP_ROOT}/nutch/bin/nutch"

echo "[9/13] JAVA_HOME setzen"
# Derive JAVA_HOME from the resolved java binary and persist it for logins.
JAVA_HOME_PATH="$(readlink -f /usr/bin/java | sed 's:bin/java::')"
cat > /etc/profile.d/at-search-java.sh <<EOF
export JAVA_HOME=${JAVA_HOME_PATH}
EOF
chmod 0644 /etc/profile.d/at-search-java.sh
export JAVA_HOME="${JAVA_HOME_PATH}"

echo "[10/13] Nutch konfigurieren"
cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
  <property>
    <name>http.agent.name</name>
    <value>ATSearchBot</value>
  </property>
  <property>
    <name>http.robots.agents</name>
    <value>ATSearchBot,*</value>
  </property>
  <property>
    <name>plugin.includes</name>
    <value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch1x|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
  </property>
  <property>
    <name>fetcher.threads.fetch</name>
    <value>10</value>
  </property>
  <property>
    <name>http.content.limit</name>
    <value>1048576</value>
  </property>
  <property>
    <name>db.ignore.external.links</name>
    <value>false</value>
  </property>
  <property>
    <name>db.ignore.internal.links</name>
    <value>false</value>
  </property>
  <property>
    <name>db.ignore.external.links.mode</name>
    <value>byHost</value>
  </property>
  <property>
    <name>generate.max.per.host</name>
    <value>25</value>
  </property>
  <property>
    <name>generate.max.per.domain</name>
    <value>100</value>
  </property>
  <property>
    <name>generate.count.mode</name>
    <value>byDomain</value>
  </property>
  <property>
    <name>fetcher.parse</name>
    <value>true</value>
  </property>
  <property>
    <name>parser.character.encoding.default</name>
    <value>UTF-8</value>
  </property>
</configuration>
EOF

# URL filter: allow Austrian domains, reject binary assets and tracking
# parameters, deny everything else ("-.").
cat > "${APP_ROOT}/nutch/conf/regex-urlfilter.txt" <<'EOF'
+^https?://([a-z0-9-]+\.)*(at|co\.at|gv\.at|ac\.at)(/.*)?$
-^https?://.+\.(7z|avi|bin|css|doc|docx|gif|gz|ico|iso|jar|jpeg|jpg|js|mov|mp3|mp4|pdf|png|rar|svg|tar|tgz|webp|woff|woff2|xls|xlsx|zip)(\?.*)?$
-^https?://.*(\?|&)(utm_|fbclid=|gclid=|reply=|share=|sort=|filter=|session=).*$
-.
EOF

# Index writer: push parsed documents into the local at_web index.
cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<writers xmlns="http://lucene.apache.org/nutch"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://lucene.apache.org/nutch index-writers.xsd">
  <writer id="indexer_opensearch_1x_1"
          class="org.apache.nutch.indexwriter.opensearch1x.OpenSearch1xIndexWriter">
    <parameters>
      <param name="host" value="localhost"/>
      <param name="port" value="9200"/>
      <param name="scheme" value="http"/>
      <param name="index" value="at_web"/>
      <param name="max.bulk.docs" value="250"/>
      <param name="max.bulk.size" value="2500500"/>
      <param name="exponential.backoff.millis" value="100"/>
      <param name="exponential.backoff.retries" value="10"/>
      <param name="bulk.close.timeout" value="600"/>
    </parameters>
    <mapping>
      <copy>
        <field source="title" dest="title,search"/>
      </copy>
      <rename />
      <remove>
        <field source="segment"/>
      </remove>
    </mapping>
  </writer>
</writers>
EOF

echo "[11/13] Österreich-Seeds schreiben"
cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF'
https://www.oesterreich.gv.at/
https://www.parlament.gv.at/
https://www.orf.at/
https://www.derstandard.at/
https://www.wien.gv.at/
https://www.arbeiterkammer.at/
https://www.wko.at/
https://www.ams.at/
https://www.univie.ac.at/
https://www.tuwien.at/
EOF

echo "[12/13] Helper-Skripte schreiben"
# run-crawl.sh: wrapper around Nutch's crawl driver; $1 = number of rounds.
cat > "${APP_ROOT}/bin/run-crawl.sh" <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(readlink -f /usr/bin/java | sed 's:bin/java::')}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
ROUNDS="${1:-3}"
CRAWL_DIR="/opt/at-search/crawl"
SEED_DIR="/opt/at-search/seeds"
mkdir -p "${CRAWL_DIR}"
cd "${NUTCH_HOME}"
exec "${NUTCH_HOME}/bin/crawl" -i -s "${SEED_DIR}" "${CRAWL_DIR}" "${ROUNDS}"
EOF
chmod +x "${APP_ROOT}/bin/run-crawl.sh"

# search.sh: query the at_web index; all arguments form the search phrase.
cat > "${APP_ROOT}/bin/search.sh" <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
if [[ $# -lt 1 ]]; then
  echo "Nutzung: $0 \"suchbegriff\""
  exit 1
fi
QUERY="$*"
BODY="$(jq -n --arg q "$QUERY" '{
  size: 10,
  query: { multi_match: { query: $q, fields: ["title^3","content","site^2","host"] } },
  _source: ["title","url","host","site"]
}')"
curl -fsS "http://127.0.0.1:9200/at_web/_search" \
  -H 'Content-Type: application/json' \
  -d "${BODY}" | jq .
EOF
chmod +x "${APP_ROOT}/bin/search.sh"

# Convenience wrappers on PATH.
cat > /usr/local/bin/at-search-crawl <<'EOF'
#!/usr/bin/env bash
exec /opt/at-search/bin/run-crawl.sh "$@"
EOF
chmod +x /usr/local/bin/at-search-crawl

cat > /usr/local/bin/at-search-query <<'EOF'
#!/usr/bin/env bash
exec /opt/at-search/bin/search.sh "$@"
EOF
chmod +x /usr/local/bin/at-search-query

echo "[13/13] Fertig"
cat <<'EOF'
==== INSTALLATION FERTIG ====

Jetzt genau das ausführen:

1) curl http://127.0.0.1:9200
2) at-search-crawl 2
3) curl http://127.0.0.1:9200/at_web/_count | jq
4) at-search-query "wien"

Wichtige Pfade:
  /opt/at-search/nutch
  /opt/at-search/seeds/seed.txt
  /opt/at-search/crawl
  /opt/at-search/opensearch/docker-compose.yml

Logs:
  cd /opt/at-search/opensearch && docker-compose logs -f
EOF
# Smoke test after installation: verify OpenSearch answers, crawl two rounds,
# count the indexed documents, and run a sample query.
curl http://127.0.0.1:9200
sudo at-search-crawl 2
curl http://127.0.0.1:9200/at_web/_count | jq
sudo at-search-query "wien"